diff --git a/tb_plugins/profiling/.github/workflows/libkineto_ci.yml b/tb_plugins/profiling/.github/workflows/libkineto_ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..3133d6400fb0b3ca0ee9b38c311c2db6d1167c7e --- /dev/null +++ b/tb_plugins/profiling/.github/workflows/libkineto_ci.yml @@ -0,0 +1,56 @@ +name: LIBKINETOCI + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v2 + - name: Checkout submodules + shell: bash + run: | + auth_header="$(git config --local --get http.https://github.com/.extraheader)" + git submodule sync --recursive + git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + + - name: Get env vars + run: | + echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW + echo HOME = $HOME + echo GITHUB_ACTION = $GITHUB_ACTION + echo GITHUB_ACTIONS = $GITHUB_ACTIONS + echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY + echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME + echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH + echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE + echo GITHUB_SHA = $GITHUB_SHA + echo GITHUB_REF = $GITHUB_REF + c++ --verbose + + # TODO: Figure out how to install cupti headers T84637671 + - name: Build static lib + run: | + set -e + mkdir build_static + cd build_static + cmake -DKINETO_LIBRARY_TYPE=static ../libkineto/ + make -j + + - name: Build shared lib + run: | + set -e + mkdir build_shared + cd build_shared + cmake -DKINETO_LIBRARY_TYPE=shared ../libkineto/ + make -j diff --git a/tb_plugins/profiling/.github/workflows/tb_plugin_build_pip_package.yml b/tb_plugins/profiling/.github/workflows/tb_plugin_build_pip_package.yml new file mode 100644 index 0000000000000000000000000000000000000000..9bdafcc442635eaff19fc7a7505f5231cf6e5cf7 --- /dev/null +++ b/tb_plugins/profiling/.github/workflows/tb_plugin_build_pip_package.yml @@ -0,0 +1,19 @@ +name: Build torch-tb-profiler Pip Package + +on: + # TODO: Add an on_release trigger to build on tags + workflow_dispatch: + +jobs: + build-package: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: build pip package + run: | + set -e + cd tb_plugin + python setup.py sdist bdist_wheel + cd dist/ + pip install *.whl + python -c "import torch_tb_profiler;print(torch_tb_profiler.__version__)" diff --git a/tb_plugins/profiling/.github/workflows/tb_plugin_ci.yml b/tb_plugins/profiling/.github/workflows/tb_plugin_ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..1b59a7bf90a6009caa41d4ac0e3d5545dc8b6c7c --- /dev/null +++ b/tb_plugins/profiling/.github/workflows/tb_plugin_ci.yml @@ -0,0 +1,57 @@ +name: TB_Plugin_CI + +on: + push: + branches: + - main + - release/** + - plugin/** + + pull_request: + branches: + - main + - release/** + - plugin/** + +jobs: + generate-matrix: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - id: set-matrix + run: | + echo $GITHUB_BASE_REF + if [ $GITHUB_BASE_REF == "plugin/vnext" ] + then + echo "::set-output name=matrix::{\"python-version\":[3.7, 3.8, 3.9], \"cuda-version\":[\"cpu\"], \"pytorch-version\":[\"nightly\"]}" + else + echo "::set-output name=matrix::{\"python-version\":[3.7, 3.8, 3.9], \"cuda-version\":[\"cpu\"], \"pytorch-version\":[\"nightly\", \"1.11rc\", \"stable\"]}" + fi + + build: + needs: generate-matrix + runs-on: ubuntu-latest + strategy: + matrix: 
${{fromJSON(needs.generate-matrix.outputs.matrix)}} + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + architecture: 'x64' + - name: Test + env: + CUDA_VERSION: ${{ matrix.cuda-version }} + PYTORCH_VERSION: ${{ matrix.pytorch-version }} + TORCH_PROFILER_LOG_LEVEL: DEBUG + GRPC_VERBOSITY: DEBUG + GRPC_ENABLE_FORK_SUPPORT: 'False' + run: | + set -e + cd tb_plugin + sh ./ci_scripts/install_env.sh + pip install .[gs] + cd test + pytest diff --git a/tb_plugins/profiling/.gitignore b/tb_plugins/profiling/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..ce186381c0b566e0ca225be70cbf8ac233d7aa6b --- /dev/null +++ b/tb_plugins/profiling/.gitignore @@ -0,0 +1,3 @@ +# ignore common items +.idea +.vscode diff --git a/tb_plugins/profiling/.gitmodules b/tb_plugins/profiling/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..4660ee8bc9e6a4be4f4fbb007b8e66058122d716 --- /dev/null +++ b/tb_plugins/profiling/.gitmodules @@ -0,0 +1,6 @@ +[submodule "libkineto/third_party/googletest"] + path = libkineto/third_party/googletest + url = https://github.com/google/googletest.git +[submodule "libkineto/third_party/fmt"] + path = libkineto/third_party/fmt + url = https://github.com/fmtlib/fmt.git diff --git a/tb_plugins/profiling/CODE_OF_CONDUCT.md b/tb_plugins/profiling/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..a0cbeaab7650bf08267fbdbc9bb54e845c88f392 --- /dev/null +++ b/tb_plugins/profiling/CODE_OF_CONDUCT.md @@ -0,0 +1,77 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq + diff --git a/tb_plugins/profiling/CONTRIBUTING.md b/tb_plugins/profiling/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..a2e931bb6f0cc82ff030cee10ee1c99fbbbda07b --- /dev/null +++ b/tb_plugins/profiling/CONTRIBUTING.md @@ -0,0 +1,34 @@ +# Contributing to Kineto +We want to make contributing to this project as easy and transparent as +possible. + +## Code of Conduct +The code of conduct is described in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md). + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `main`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## License +By contributing to Kineto, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. 
diff --git a/tb_plugins/profiling/LICENSE b/tb_plugins/profiling/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..edb179715b5213644cfe903d43294f54892e707e --- /dev/null +++ b/tb_plugins/profiling/LICENSE @@ -0,0 +1,33 @@ +BSD License + +For Kineto software + +Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. + +All contributions by Microsoft: +Copyright (c) Microsoft Corporation. (The Azure AI Platform team) + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name Facebook nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/tb_plugins/profiling/README.md b/tb_plugins/profiling/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0353b1b4df0003778aebe890bf7a17e4d556c000 --- /dev/null +++ b/tb_plugins/profiling/README.md @@ -0,0 +1,38 @@ +# Kineto + +Kineto is part of the PyTorch Profiler. + +The Kineto project was started to help enable +- **performance observability and diagnostics** across common ML bottleneck components +- **actionable recommendations** for common issues +- integration of external system-level profiling tools +- integration with popular visualization platforms and analysis pipelines + +A central component is libkineto, a profiling library with special focus on low-overhead GPU timeline tracing. + +The PyTorch Profiler TensorBoard plugin provides powerful and intuitive visualizations of profiling results, as well as actionable recommendations, and is the best way to experience the new PyTorch Profiler. + +## Libkineto +Libkineto is an in-process profiling library integrated with the PyTorch Profiler. Please refer to the [README](libkineto/README.md) file in the `libkineto` folder as well as documentation on the [new PyTorch Profiler API](https://pytorch.org/docs/master/profiler.html). + +## PyTorch TensorBoard Profiler +The goal of the PyTorch TensorBoard Profiler is to provide a seamless and intuitive end-to-end profiling experience, including straightforward collection from PyTorch and insightful visualizations and recommendations in the TensorBoard UI. +Please refer to the [README](tb_plugin/README.md) file in the `tb_plugin` folder. 
+ +## Future Development Direction: +Some areas we're currently working on: +- Support for tracing distributed workloads +- Trace processing, analysis and recommendation engine +- System-level activities, multiple tracing sources +- Profiling and monitoring daemon for larger scale deployments + +## Releases and Contributing +We will follow the PyTorch release schedule which roughly happens on a 3 month basis. + +We appreciate all contributions. If you are planning to contribute back bug-fixes, please do so without any further discussion. + +If you plan to contribute new features, please first open an issue and discuss the feature with us. Sending a PR without discussion might end up resulting in a rejected PR because we might be taking the infrastructure in a different direction than you might be aware of. We expect the architecture to keep evolving. + +## License +Kineto has a BSD-style license, as found in the [LICENSE](LICENSE) file. + diff --git a/tb_plugins/profiling/libkineto/CMakeLists.txt b/tb_plugins/profiling/libkineto/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..63966de803a786913b104419776aa94bb00b74b0 --- /dev/null +++ b/tb_plugins/profiling/libkineto/CMakeLists.txt @@ -0,0 +1,198 @@ +cmake_minimum_required(VERSION 3.5 FATAL_ERROR) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") + +#install libraries into correct locations on all platforms +include(GNUInstallDirs) + +# function to extract filelists from libkineto_defs.bzl file +find_package(PythonInterp) +function(get_filelist name outputvar) + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" -c + "exec(open('libkineto_defs.bzl').read());print(';'.join(${name}))" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + OUTPUT_VARIABLE _tempvar) + string(REPLACE "\n" "" _tempvar "${_tempvar}") + set(${outputvar} ${_tempvar} PARENT_SCOPE) +endfunction() + +project(kineto VERSION 0.1 LANGUAGES CXX C) + +set(KINETO_LIBRARY_TYPE "default" CACHE STRING + "Type of library (default, static or shared) to build") +set_property(CACHE KINETO_LIBRARY_TYPE PROPERTY STRINGS default shared) +option(KINETO_BUILD_TESTS "Build kineto unit tests" ON) + +set(LIBKINETO_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src") +set(LIBKINETO_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include") +set(LIBKINETO_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(LIBKINETO_THIRDPARTY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third_party") +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +#We should default to a Release build +if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) +endif() + +if (NOT CUDA_SOURCE_DIR) + set(CUDA_SOURCE_DIR "$ENV{CUDA_SOURCE_DIR}") + message(INFO " CUDA_SOURCE_DIR = ${CUDA_SOURCE_DIR}") +endif() + +if (NOT ROCM_SOURCE_DIR) + set(ROCM_SOURCE_DIR "$ENV{ROCM_SOURCE_DIR}") + message(INFO " ROCM_SOURCE_DIR = ${ROCM_SOURCE_DIR}") +endif() + +# Set LIBKINETO_NOCUPTI to explicitly disable CUPTI +# Otherwise, CUPTI is disabled if not found +IF (NOT CUDA_SOURCE_DIR OR NOT CUPTI_INCLUDE_DIR OR NOT CUDA_cupti_LIBRARY) + set(LIBKINETO_NOCUPTI ON CACHE BOOL "" FORCE) +endif() + +IF (NOT ROCM_SOURCE_DIR AND NOT ROCTRACER_INCLUDE_DIR) + set(LIBKINETO_NOROCTRACER ON CACHE BOOL "" FORCE) +endif() + +# Define file lists +if (LIBKINETO_NOCUPTI AND LIBKINETO_NOROCTRACER) + get_filelist("get_libkineto_cpu_only_srcs(with_api=False)" LIBKINETO_SRCS) + message(INFO " CUPTI unavailable or disabled - not building GPU profilers") +elseif(NOT LIBKINETO_NOROCTRACER) + 
get_filelist("get_libkineto_roctracer_srcs()" LIBKINETO_SRCS) + message(INFO " Building with roctracer") +else() + get_filelist("get_libkineto_cupti_srcs(with_api=False)" LIBKINETO_SRCS) +endif() +get_filelist("get_libkineto_public_headers()" LIBKINETO_PUBLIC_HEADERS) +get_filelist("get_libkineto_api_srcs()" LIBKINETO_API_SRCS) + +add_library(kineto_base OBJECT ${LIBKINETO_SRCS}) +add_library(kineto_api OBJECT ${LIBKINETO_API_SRCS}) + +# Make libraries depend on libkineto_defs.bzl +add_custom_target(libkineto_defs.bzl DEPENDS libkineto_defs.bzl) +add_dependencies(kineto_base libkineto_defs.bzl) + +set_target_properties(kineto_base kineto_api PROPERTIES + CXX_STANDARD 14 + CXX_STANDARD_REQUIRED YES + CXX_EXTENSIONS NO + CXX_VISIBILITY_PRESET hidden) + +set(KINETO_COMPILE_OPTIONS "-DKINETO_NAMESPACE=libkineto") +list(APPEND KINETO_COMPILE_OPTIONS "-DFMT_HEADER_ONLY") +if(NOT MSVC) + list(APPEND KINETO_COMPILE_OPTIONS "-std=c++14") +else() + list(APPEND KINETO_COMPILE_OPTIONS "/std:c++14") + list(APPEND KINETO_COMPILE_OPTIONS "-DWIN32_LEAN_AND_MEAN") + list(APPEND KINETO_COMPILE_OPTIONS "-DNOGDI") +endif() +if (NOT LIBKINETO_NOCUPTI) + list(APPEND KINETO_COMPILE_OPTIONS "-DHAS_CUPTI") +endif() +if (NOT LIBKINETO_NOROCTRACER) + target_compile_options(kineto_base PRIVATE "-DHAS_ROCTRACER") + target_compile_options(kineto_base PRIVATE "-D__HIP_PLATFORM_HCC__") + target_compile_options(kineto_base PRIVATE "-D__HIP_PLATFORM_AMD__") +endif() + +target_compile_options(kineto_base PRIVATE "${KINETO_COMPILE_OPTIONS}") +target_compile_options(kineto_api PRIVATE "${KINETO_COMPILE_OPTIONS}") + +if(NOT TARGET fmt) + if(NOT FMT_SOURCE_DIR) + set(FMT_SOURCE_DIR "${LIBKINETO_THIRDPARTY_DIR}/fmt" + CACHE STRING "fmt source directory from submodules") + endif() + + # Build FMT. + # FMT and some other libraries use BUILD_SHARED_LIBS to control + # the library type. 
+  # Save and restore the value after configuring FMT
+  set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
+  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE)
+  set(FMT_LIBRARY_TYPE static CACHE STRING "Set lib type to static")
+  add_subdirectory("${FMT_SOURCE_DIR}" "${LIBKINETO_BINARY_DIR}/fmt")
+  set_property(TARGET fmt PROPERTY POSITION_INDEPENDENT_CODE ON)
+  set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE)
+endif()
+
+set(FMT_INCLUDE_DIR "${FMT_SOURCE_DIR}/include")
+message(STATUS "Kineto: FMT_SOURCE_DIR = ${FMT_SOURCE_DIR}")
+message(STATUS "Kineto: FMT_INCLUDE_DIR = ${FMT_INCLUDE_DIR}")
+if (NOT CUPTI_INCLUDE_DIR)
+  set(CUPTI_INCLUDE_DIR "${CUDA_SOURCE_DIR}/extras/CUPTI/include")
+endif()
+if (NOT CUDA_INCLUDE_DIRS)
+  set(CUDA_INCLUDE_DIRS "${CUDA_SOURCE_DIR}/include")
+endif()
+if (NOT ROCTRACER_INCLUDE_DIR)
+  set(ROCTRACER_INCLUDE_DIR "${ROCM_SOURCE_DIR}/roctracer/include")
+endif()
+if (NOT ROCM_INCLUDE_DIRS)
+  set(ROCM_INCLUDE_DIRS "${ROCM_SOURCE_DIR}/include")
+endif()
+
+message(INFO " CUPTI_INCLUDE_DIR = ${CUPTI_INCLUDE_DIR}")
+message(INFO " ROCTRACER_INCLUDE_DIR = ${ROCTRACER_INCLUDE_DIR}")
+
+target_include_directories(kineto_base PUBLIC
+      $<BUILD_INTERFACE:${LIBKINETO_INCLUDE_DIR}>
+      $<BUILD_INTERFACE:${LIBKINETO_SOURCE_DIR}>
+      $<BUILD_INTERFACE:${FMT_INCLUDE_DIR}>
+      $<BUILD_INTERFACE:${CUPTI_INCLUDE_DIR}>
+      $<BUILD_INTERFACE:${CUDA_INCLUDE_DIRS}>
+      $<BUILD_INTERFACE:${ROCTRACER_INCLUDE_DIR}>
+      $<BUILD_INTERFACE:${ROCM_INCLUDE_DIRS}>)
+
+target_include_directories(kineto_api PUBLIC
+      $<BUILD_INTERFACE:${LIBKINETO_INCLUDE_DIR}>
+      $<BUILD_INTERFACE:${FMT_INCLUDE_DIR}>)
+
+if(KINETO_LIBRARY_TYPE STREQUAL "default")
+  add_library(kineto
+    $<TARGET_OBJECTS:kineto_base>
+    $<TARGET_OBJECTS:kineto_api>)
+elseif(KINETO_LIBRARY_TYPE STREQUAL "static")
+  add_library(kineto STATIC
+    $<TARGET_OBJECTS:kineto_base>
+    $<TARGET_OBJECTS:kineto_api>)
+elseif(KINETO_LIBRARY_TYPE STREQUAL "shared")
+  add_library(kineto SHARED
+    $<TARGET_OBJECTS:kineto_base>)
+  set_property(TARGET kineto_base PROPERTY POSITION_INDEPENDENT_CODE ON)
+  set_target_properties(kineto PROPERTIES
+    CXX_VISIBILITY_PRESET hidden)
+else()
+  message(FATAL_ERROR "Unsupported library type ${KINETO_LIBRARY_TYPE}")
+endif()
+
+if(NOT LIBKINETO_NOROCTRACER)
+  find_library(ROCTRACER_LIBRARY NAMES libroctracer64.so HINTS /opt/rocm/roctracer/lib)
+  target_link_libraries(kineto "${ROCTRACER_LIBRARY}")
+  find_library(KINETO_HIP_LIBRARY NAMES libamdhip64.so HINTS /opt/rocm/lib)
+  target_link_libraries(kineto "${KINETO_HIP_LIBRARY}")
+endif()
+
+if(NOT LIBKINETO_NOCUPTI)
+  target_link_libraries(kineto "${CUDA_cupti_LIBRARY}")
+endif()
+target_link_libraries(kineto $<BUILD_INTERFACE:fmt::fmt-header-only>)
+add_dependencies(kineto fmt::fmt-header-only)
+
+install(TARGETS kineto EXPORT kinetoLibraryConfig
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+install(FILES ${LIBKINETO_PUBLIC_HEADERS}
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/kineto")
+
+install(EXPORT kinetoLibraryConfig DESTINATION share/cmake/kineto
+  FILE kinetoLibraryConfig.cmake)
+
+if(KINETO_BUILD_TESTS)
+  add_subdirectory(test)
+endif() diff --git a/tb_plugins/profiling/libkineto/README.md b/tb_plugins/profiling/libkineto/README.md new file mode 100644 index 0000000000000000000000000000000000000000..37127ca5aa821217da48aad38cb82eb36f8735c2 --- /dev/null +++ b/tb_plugins/profiling/libkineto/README.md @@ -0,0 +1,65 @@
+# Libkineto
+
+Libkineto is an in-process profiling library, part of the Kineto performance
+tools project.
+
+The library provides a way to collect GPU traces and metrics from the host
+process, either via the library public API or by sending a signal, if enabled.
+
+Currently only NVIDIA GPUs are supported.
+
+## Build Notes
+Libkineto uses the standard CMake-based build flow.
+
+### Dependencies
+Libkineto requires gcc 5+ and:
+
+- NVIDIA CUPTI: used to collect traces and metrics from NVIDIA GPUs.
+- fmt: used for its convenient and lightweight string formatting functionality.
+- googletest: required to build and run Kineto's tests.
+  - **googletest is not required** if you don't want to run Kineto tests.
+By default, building of tests is **on**. Turn it off by setting `KINETO_BUILD_TESTS` to **off**.
+
+You can download [NVIDIA CUPTI][1], [fmt][2], [googletest][3] and set
+`CUDA_SOURCE_DIR`, `FMT_SOURCE_DIR`, `GOOGLETEST_SOURCE_DIR` respectively for
+cmake to find these libraries. If the fmt and googletest variables are not set, cmake will
+build the git submodules found in the `third_party` directory.
+If `CUDA_SOURCE_DIR` is not set, libkineto will fail to build.
+
+### Building Libkineto
+
+```
+# Check out repo and submodules
+git clone --recursive https://github.com/pytorch/kineto.git
+# Build libkineto with cmake
+cd kineto/libkineto
+mkdir build && cd build
+cmake ..
+make
+```
+
+To run the tests after building libkineto (if tests are built), use the following
+command:
+```
+make test
+```
+
+### Installing Libkineto
+```
+make install
+```
+
+## How Libkineto works
+We will provide a high-level overview, design philosophy and brief descriptions of various
+parts of Libkineto in upcoming blogs.
+
+## Full documentation
+We strive to keep our source files readable. The best and most up-to-date
+documentation is available in the source files.
+
+## License
+Libkineto is BSD licensed, as detailed in the [LICENSE](../LICENSE) file.
+
+[1]:https://developer.nvidia.com/CUPTI-CTK10_2
+[2]:https://github.com/fmtlib/fmt
+[3]:https://github.com/google/googletest diff --git a/tb_plugins/profiling/libkineto/include/AbstractConfig.h b/tb_plugins/profiling/libkineto/include/AbstractConfig.h new file mode 100644 index 0000000000000000000000000000000000000000..1cadf4906c11c3b5f59e290295048cee7fd63acf --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/AbstractConfig.h @@ -0,0 +1,113 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <chrono>
+#include <functional>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace KINETO_NAMESPACE {
+
+class AbstractConfig {
+ public:
+  AbstractConfig& operator=(const AbstractConfig&) = delete;
+  AbstractConfig(AbstractConfig&&) = delete;
+  AbstractConfig& operator=(AbstractConfig&&) = delete;
+
+  virtual ~AbstractConfig() {
+    for (const auto& p : featureConfigs_) {
+      delete p.second;
+    }
+  }
+
+  // Return a copy of the full derived class
+  virtual AbstractConfig* cloneDerived(AbstractConfig& parent) const = 0;
+
+  // Returns true if successfully parsed the config string
+  bool parse(const std::string& conf);
+
+  // Default setup for signal-triggered profiling
+  virtual void setSignalDefaults() {
+    for (auto& p : featureConfigs_) {
+      p.second->setSignalDefaults();
+    }
+  }
+
+  // Default setup for client-triggered profiling
+  virtual void setClientDefaults() {
+    for (auto& p : featureConfigs_) {
+      p.second->setClientDefaults();
+    }
+  }
+
+  // Time config was created / updated
+  std::chrono::time_point<std::chrono::system_clock> timestamp() const {
+    return timestamp_;
+  }
+
+  // Source config string that this was parsed from
+  const std::string& source() const {
+    return source_;
+  }
+
+  AbstractConfig& feature(std::string name) const {
+    const auto& pos = featureConfigs_.find(name);
+    return *pos->second;
+  }
+
+  // Transfers ownership of cfg arg
+  void addFeature(const std::string& name, AbstractConfig* cfg) {
+    featureConfigs_[name] = cfg;
+  }
+
+ protected:
+  AbstractConfig() {}
+  AbstractConfig(const AbstractConfig& other) = default;
+
+  // Return true if the option was recognized and successfully parsed.
+  // Throw std::invalid_argument if val is invalid.
+  virtual bool handleOption(const std::string& name, std::string& val);
+
+  // Perform post-validation checks, typically conditions involving
+  // multiple options.
+  // Throw std::invalid_argument if automatic correction can not be made.
+  //
+  // @param fallbackProfileStartTime Specify a fallback profile start timestamp in case it was never specified by the client
+  virtual void validate(const std::chrono::time_point<std::chrono::system_clock>& fallbackProfileStartTime) = 0;
+
+  // TODO: Separate out each profiler type into features?
+  virtual void printActivityProfilerConfig(std::ostream& s) const;
+
+  // Helpers for use in handleOption
+  // Split a string by delimiter and remove external white space
+  std::vector<std::string> splitAndTrim(const std::string& s, char delim) const;
+  // Lowercase for case-insensitive comparisons
+  std::string toLower(std::string& s) const;
+  // Does string end with suffix
+  bool endsWith(const std::string& s, const std::string& suffix) const;
+  // Conversions
+  int64_t toIntRange(const std::string& val, int64_t min, int64_t max) const;
+  int32_t toInt32(const std::string& val) const;
+  int64_t toInt64(const std::string& val) const;
+  bool toBool(std::string& val) const;
+
+  void cloneFeaturesInto(AbstractConfig& cfg) const {
+    for (const auto& feature : featureConfigs_) {
+      cfg.featureConfigs_[feature.first] = feature.second->cloneDerived(cfg);
+    }
+  }
+
+ private:
+  // Time config was created / updated
+  std::chrono::time_point<std::chrono::system_clock> timestamp_{};
+
+  // Original configuration string, used for comparison
+  std::string source_{""};
+
+  // Configuration objects for optional features
+  std::map<std::string, AbstractConfig*> featureConfigs_{};
+};
+
+} // namespace KINETO_NAMESPACE
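The `parse()`/`handleOption()` contract above is easiest to see with the string helpers in action. Below is a minimal, self-contained sketch of the kind of `NAME = VALUE` splitting those helpers imply; `splitAndTrim` here is a local stand-in for the protected helper declared above, and the option names are illustrative only.

```
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Local stand-in for AbstractConfig::splitAndTrim(): split on a delimiter
// and strip surrounding whitespace from each piece.
static std::vector<std::string> splitAndTrim(const std::string& s, char delim) {
  std::vector<std::string> parts;
  std::istringstream in(s);
  std::string part;
  while (std::getline(in, part, delim)) {
    size_t b = part.find_first_not_of(" \t");
    size_t e = part.find_last_not_of(" \t");
    parts.push_back(b == std::string::npos ? "" : part.substr(b, e - b + 1));
  }
  return parts;
}

int main() {
  // One "NAME = VALUE" option per line, as a config string might arrive.
  for (const auto& line :
       splitAndTrim("SAMPLE_PERIOD_MSECS = 100\nEVENTS = elapsed_cycles", '\n')) {
    auto kv = splitAndTrim(line, '=');
    if (kv.size() == 2) {
      // A real config would dispatch to handleOption(kv[0], kv[1]) here.
      std::cout << "option '" << kv[0] << "' = '" << kv[1] << "'\n";
    }
  }
}
```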
diff --git a/tb_plugins/profiling/libkineto/include/ActivityProfilerInterface.h b/tb_plugins/profiling/libkineto/include/ActivityProfilerInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..29871e47ab8af87888ccb8e20403bc26c433b5cc --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/ActivityProfilerInterface.h @@ -0,0 +1,91 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "ActivityType.h"
+#include "ActivityTraceInterface.h"
+#include "IActivityProfiler.h"
+
+namespace libkineto {
+
+class ActivityProfilerController;
+struct CpuTraceBuffer;
+class Config;
+
+class ActivityProfilerInterface {
+
+ public:
+  virtual ~ActivityProfilerInterface() {};
+
+  virtual void init() {}
+  virtual bool isInitialized() {
+    return false;
+  }
+  virtual bool isActive(){
+    return false;
+  }
+
+  // *** Asynchronous API ***
+  // Instead of starting and stopping the trace manually, provide a start time
+  // and duration and / or iteration stop criterion.
+  // Tracing terminates when either condition is met.
+  virtual void scheduleTrace(const std::string& configStr) {}
+
+  // *** Synchronous API ***
+  // These must be called in order:
+  // prepareTrace -> startTrace -> stopTrace.
+
+  // Many tracing structures are lazily initialized during trace collection,
+  // with potentially high overhead.
+  // Call prepareTrace to enable tracing, then run the region to trace
+  // at least once (and ideally run the same code that is to be traced) to
+  // allow tracing structures to be initialized.
+  virtual void prepareTrace(
+      const std::set<ActivityType>& activityTypes,
+      const std::string& configStr = "") {}
+
+  // Start recording, potentially reusing any buffers allocated since
+  // prepareTrace was called.
+  virtual void startTrace() {}
+
+  // Stop and process trace, producing an in-memory list of trace records.
+  // The processing will be done synchronously (using the calling thread.)
+  virtual std::unique_ptr<ActivityTraceInterface> stopTrace() {
+    return nullptr;
+  }
+
+  // Re-evaluate internal state to allow for triggering operations based
+  // on the number of iterations. Each call implicitly increments the iteration count.
+  virtual void step() {}
+
+  // *** TraceActivity API ***
+  // FIXME: Pass activityProfiler interface into clientInterface?
+  virtual void pushCorrelationId(uint64_t id){}
+  virtual void popCorrelationId(){}
+  virtual void transferCpuTrace(
+      std::unique_ptr<CpuTraceBuffer> traceBuffer){}
+
+  // Correlation ids for user defined spans
+  virtual void pushUserCorrelationId(uint64_t){}
+  virtual void popUserCorrelationId(){}
+
+  // Saves information for the current thread to be used in profiler output
+  // Client must record any new kernel thread where the activity has occurred.
+  virtual void recordThreadInfo() {}
+
+  // Record trace metadata, currently supporting only string keys and values;
+  // values with the same key are overwritten
+  virtual void addMetadata(const std::string& key, const std::string& value) = 0;
+
+  // Add a child activity profiler. This enables frameworks in the application
+  // to add custom framework events.
+  virtual void addChildActivityProfiler(
+      std::unique_ptr<IActivityProfiler> profiler) {}
+};
+
+} // namespace libkineto
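The synchronous ordering described above (prepareTrace, then startTrace, then stopTrace) looks roughly like this from a client's point of view. This is a sketch only: it assumes a concrete profiler has already been registered (e.g. by a PyTorch build that links libkineto), uses the `api()` singleton accessor that this diff adds later in `libkineto.h`, and the output path is illustrative.

```
#include <set>

#include "libkineto.h"

void traceOnce() {
  libkineto::api().initProfilerIfRegistered();
  auto& profiler = libkineto::api().activityProfiler();

  std::set<libkineto::ActivityType> types{
      libkineto::ActivityType::CPU_OP,
      libkineto::ActivityType::CONCURRENT_KERNEL};

  profiler.prepareTrace(types);  // warm up lazily-initialized structures
  profiler.startTrace();
  // ... run the workload to be profiled ...
  auto trace = profiler.stopTrace();  // std::unique_ptr<ActivityTraceInterface>
  if (trace) {
    trace->save("/tmp/kineto_trace.json");  // path is illustrative
  }
}
```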
diff --git a/tb_plugins/profiling/libkineto/include/ActivityTraceInterface.h b/tb_plugins/profiling/libkineto/include/ActivityTraceInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..23d4edab00ce2fa90427e13818ac09c8541835ac --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/ActivityTraceInterface.h @@ -0,0 +1,21 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+namespace libkineto {
+
+struct ITraceActivity;
+
+class ActivityTraceInterface {
+ public:
+  virtual ~ActivityTraceInterface() {}
+  virtual const std::vector<const ITraceActivity*>* activities() {
+    return nullptr;
+  }
+  virtual void save(const std::string& path) {}
+};
+
+} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/include/ActivityType.h b/tb_plugins/profiling/libkineto/include/ActivityType.h new file mode 100644 index 0000000000000000000000000000000000000000..74c6a2531d6a9cee3196f9f889517926afea823f --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/ActivityType.h @@ -0,0 +1,34 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <array>
+#include <string>
+
+namespace libkineto {
+
+enum class ActivityType {
+    CPU_OP = 0, // cpu side ops
+    USER_ANNOTATION,
+    GPU_USER_ANNOTATION,
+    GPU_MEMCPY,
+    GPU_MEMSET,
+    CONCURRENT_KERNEL, // on-device kernels
+    EXTERNAL_CORRELATION,
+    CUDA_RUNTIME, // host side cuda runtime events
+    CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics
+    GLOW_RUNTIME, // host side glow runtime events
+    CPU_INSTANT_EVENT, // host side point-like events
+    PYTHON_FUNCTION,
+    OVERHEAD, // CUPTI induced overhead events sampled from its overhead API.
+    ENUM_COUNT // This is a buffer entry, not used for any profiling logic. Add new types before it.
+};
+
+const char* toString(ActivityType t);
+ActivityType toActivityType(const std::string& str);
+
+// Return an array of all activity types except ENUM_COUNT
+constexpr int activityTypeCount = (int)ActivityType::ENUM_COUNT;
+const std::array<ActivityType, activityTypeCount> activityTypes();
+
+} // namespace libkineto
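A small sketch of how the declarations above compose: `activityTypes()` enumerates every type except `ENUM_COUNT`, and `toActivityType()` is expected to invert `toString()` for valid names (the exact string spellings are defined in the corresponding .cpp, which is not shown in this diff).

```
#include <iostream>

#include "ActivityType.h"

int main() {
  for (libkineto::ActivityType t : libkineto::activityTypes()) {
    const char* s = libkineto::toString(t);
    if (libkineto::toActivityType(s) != t) {
      std::cerr << "round-trip failed for " << s << "\n";
      return 1;
    }
  }
  std::cout << libkineto::activityTypeCount << " types round-tripped\n";
}
```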
diff --git a/tb_plugins/profiling/libkineto/include/ClientInterface.h b/tb_plugins/profiling/libkineto/include/ClientInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..06dc075838164f80e9481b34a5d5d3c136b92efd --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/ClientInterface.h @@ -0,0 +1,16 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+namespace libkineto {
+
+class ClientInterface {
+ public:
+  virtual ~ClientInterface() {}
+  virtual void init() = 0;
+  virtual void warmup(bool setupOpInputsCollection) = 0;
+  virtual void start() = 0;
+  virtual void stop() = 0;
+};
+
+} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/include/Config.h b/tb_plugins/profiling/libkineto/include/Config.h new file mode 100644 index 0000000000000000000000000000000000000000..040e96c9f75ab3ab768aaebac28f959f12a3ea06 --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/Config.h @@ -0,0 +1,433 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include "AbstractConfig.h"
+#include "ActivityType.h"
+
+#include <chrono>
+#include <functional>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace KINETO_NAMESPACE {
+
+using namespace libkineto;
+
+class Config : public AbstractConfig {
+ public:
+  Config();
+  Config& operator=(const Config&) = delete;
+  Config(Config&&) = delete;
+  Config& operator=(Config&&) = delete;
+
+  // Return a full copy including feature config object
+  std::unique_ptr<Config> clone() const {
+    auto cfg = std::unique_ptr<Config>(new Config(*this));
+    cloneFeaturesInto(*cfg);
+    return cfg;
+  }
+
+  bool handleOption(const std::string& name, std::string& val) override;
+
+  void setClientDefaults() override;
+
+  // Log events to this file
+  const std::string& eventLogFile() const {
+    return eventLogFile_;
+  }
+
+  bool activityProfilerEnabled() const {
+    return activityProfilerEnabled_ ||
+        activitiesOnDemandTimestamp_.time_since_epoch().count() > 0;
+  }
+
+  // Log activity trace to this file
+  const std::string& activitiesLogFile() const {
+    return activitiesLogFile_;
+  }
+
+  // Log activity trace to this url
+  const std::string& activitiesLogUrl() const {
+    return activitiesLogUrl_;
+  }
+
+  void setActivitiesLogUrl(const std::string& url) {
+    activitiesLogUrl_ = url;
+  }
+
+  bool activitiesLogToMemory() const {
+    return activitiesLogToMemory_;
+  }
+
+  // Is profiling enabled for the given device?
+  bool eventProfilerEnabledForDevice(uint32_t dev) const {
+    return 0 != (eventProfilerDeviceMask_ & (1 << dev));
+  }
+
+  // Take a sample (read hardware counters) at this frequency.
+  // This controls how often counters are read - if all counters cannot
+  // be collected simultaneously then multiple samples are needed to
+  // collect all requested counters - see multiplex period.
+  std::chrono::milliseconds samplePeriod() const {
+    return samplePeriod_;
+  }
+
+  void setSamplePeriod(std::chrono::milliseconds period) {
+    samplePeriod_ = period;
+  }
+
+  // When all requested counters cannot be collected simultaneously,
+  // counters will be multiplexed at this frequency.
+  // Multiplexing can have a large performance impact if done frequently.
+  // To avoid a perf impact, keep this at 1s or above.
+  std::chrono::milliseconds multiplexPeriod() const {
+    return multiplexPeriod_;
+  }
+
+  void setMultiplexPeriod(std::chrono::milliseconds period) {
+    multiplexPeriod_ = period;
+  }
+
+  // Report counters at this frequency. Note that several samples can
+  // be reported each time, see samplesPerReport.
+  std::chrono::milliseconds reportPeriod() const {
+    return reportPeriod_;
+  }
+
+  void setReportPeriod(std::chrono::milliseconds msecs);
+
+  // Number of samples dispatched each report period.
+  // Must be in the range [1, report period / sample period].
+  // In other words, aggregation is supported but not interpolation.
+  int samplesPerReport() const {
+    return samplesPerReport_;
+  }
+
+  void setSamplesPerReport(int count) {
+    samplesPerReport_ = count;
+  }
+
+  // The names of events to collect
+  const std::set<std::string>& eventNames() const {
+    return eventNames_;
+  }
+
+  // Add additional events to be profiled
+  void addEvents(const std::set<std::string>& names) {
+    eventNames_.insert(names.begin(), names.end());
+  }
+
+  // The names of metrics to collect
+  const std::set<std::string>& metricNames() const {
+    return metricNames_;
+  }
+
+  // Add additional metrics to be profiled
+  void addMetrics(const std::set<std::string>& names) {
+    metricNames_.insert(names.begin(), names.end());
+  }
+
+  const std::vector<double>& percentiles() const {
+    return eventReportPercentiles_;
+  }
+
+  // Profile for this long, then revert to base config
+  std::chrono::seconds eventProfilerOnDemandDuration() const {
+    return eventProfilerOnDemandDuration_;
+  }
+
+  void setEventProfilerOnDemandDuration(std::chrono::seconds duration) {
+    eventProfilerOnDemandDuration_ = duration;
+  }
+
+  // Too many event profilers on a single system can overload the driver.
+  // At some point, latencies shoot through the roof and collection of samples
+  // becomes impossible. To avoid this situation we have a limit of profilers
+  // per GPU.
+  // NOTE: Communication with a daemon is needed for this feature.
+  // Library must be built with an active DaemonConfigLoader.
+  int maxEventProfilersPerGpu() const {
+    return eventProfilerMaxInstancesPerGpu_;
+  }
+
+  // On Cuda11 we've seen occasional hangs when reprogramming counters
+  // Monitor profiling threads and report when a thread is not responding
+  // for a given number of seconds.
+  // A period of 0 means disable.
+  std::chrono::seconds eventProfilerHeartbeatMonitorPeriod() const {
+    return eventProfilerHeartbeatMonitorPeriod_;
+  }
+
+  // The types of activities selected in the configuration file
+  const std::set<ActivityType>& selectedActivityTypes() const {
+    return selectedActivityTypes_;
+  }
+
+  void setSelectedActivityTypes(const std::set<ActivityType>& types) {
+    selectedActivityTypes_ = types;
+  }
+
+  bool isOpInputsCollectionEnabled() const {
+    return enableOpInputsCollection_;
+  }
+
+  // Trace for this long
+  std::chrono::milliseconds activitiesDuration() const {
+    return activitiesDuration_;
+  }
+
+  // Trace for this many iterations, determined by external API
+  int activitiesRunIterations() const {
+    return activitiesRunIterations_;
+  }
+
+  std::chrono::milliseconds activitiesDurationDefault() const;
+
+  void setActivitiesDuration(std::chrono::milliseconds duration) {
+    activitiesDuration_ = duration;
+  }
+
+  int activitiesMaxGpuBufferSize() const {
+    return activitiesMaxGpuBufferSize_;
+  }
+
+  std::chrono::seconds activitiesWarmupDuration() const {
+    return activitiesWarmupDuration_;
+  }
+
+  int activitiesWarmupIterations() const {
+    return activitiesWarmupIterations_;
+  }
+
+  // Timestamp at which profiling should start, as requested by the user.
+  const std::chrono::time_point<std::chrono::system_clock> requestTimestamp()
+      const {
+    if (profileStartTime_.time_since_epoch().count()) {
+      return profileStartTime_;
+    }
+
+    // TODO(T94634890): Deprecate requestTimestamp
+    return requestTimestamp_ + maxRequestAge() + activitiesWarmupDuration();
+  }
+
+  bool hasProfileStartTime() const {
+    return requestTimestamp_.time_since_epoch().count() > 0 ||
+        profileStartTime_.time_since_epoch().count() > 0;
+  }
+
+  int profileStartIteration() const {
+    return profileStartIteration_;
+  }
+
+  bool hasProfileStartIteration() const {
+    return profileStartIteration_ >= 0 && activitiesRunIterations_ > 0;
+  }
+
+  void setProfileStartIteration(int iter) {
+    profileStartIteration_ = iter;
+  }
+
+  int profileStartIterationRoundUp() const {
+    return profileStartIterationRoundUp_;
+  }
+
+  // Calculate the start iteration accounting for warmup
+  int startIterationIncludingWarmup() const {
+    if (!hasProfileStartIteration()) {
+      return -1;
+    }
+    return profileStartIteration_ - activitiesWarmupIterations_;
+  }
+
+  const std::chrono::seconds maxRequestAge() const;
+
+  // All VLOG* macros will log if the verbose log level is >=
+  // the verbosity specified for the verbose log message.
+  // Default value is -1, so messages with log level 0 will log by default.
+  int verboseLogLevel() const {
+    return verboseLogLevel_;
+  }
+
+  // Modules for which verbose logging is enabled.
+  // If empty, logging is enabled for all modules.
+  const std::vector<std::string>& verboseLogModules() const {
+    return verboseLogModules_;
+  }
+
+  bool sigUsr2Enabled() const {
+    return enableSigUsr2_;
+  }
+
+  bool ipcFabricEnabled() const {
+    return enableIpcFabric_;
+  }
+
+  static std::chrono::milliseconds alignUp(
+      std::chrono::milliseconds duration,
+      std::chrono::milliseconds alignment) {
+    duration += alignment;
+    return duration - (duration % alignment);
+  }
+
+  std::chrono::time_point<std::chrono::system_clock>
+  eventProfilerOnDemandStartTime() const {
+    return eventProfilerOnDemandTimestamp_;
+  }
+
+  std::chrono::time_point<std::chrono::system_clock>
+  eventProfilerOnDemandEndTime() const {
+    return eventProfilerOnDemandTimestamp_ + eventProfilerOnDemandDuration_;
+  }
+
+  std::chrono::time_point<std::chrono::system_clock>
+  activityProfilerRequestReceivedTime() const {
+    return activitiesOnDemandTimestamp_;
+  }
+
+  // Users may request and set trace id and group trace id.
+  const std::string& requestTraceID() const {
+    return requestTraceID_;
+  }
+
+  void setRequestTraceID(const std::string& tid) {
+    requestTraceID_ = tid;
+  }
+
+  const std::string& requestGroupTraceID() const {
+    return requestGroupTraceID_;
+  }
+
+  void setRequestGroupTraceID(const std::string& gtid) {
+    requestGroupTraceID_ = gtid;
+  }
+
+  void updateActivityProfilerRequestReceivedTime();
+
+  void printActivityProfilerConfig(std::ostream& s) const override;
+
+  void validate(
+      const std::chrono::time_point<std::chrono::system_clock>& fallbackProfileStartTime) override;
+
+  static void addConfigFactory(
+      std::string name,
+      std::function<AbstractConfig*(Config&)> factory);
+
+  void print(std::ostream& s) const;
+
+ private:
+  explicit Config(const Config& other) = default;
+
+  AbstractConfig* cloneDerived(AbstractConfig& parent) const override {
+    // Clone from AbstractConfig not supported
+    assert(false);
+    return nullptr;
+  }
+
+  uint8_t createDeviceMask(const std::string& val);
+
+  // Adds valid activity types from the user defined string list in the
+  // configuration file
+  void setActivityTypes(const std::vector<std::string>& selected_activities);
+
+  // Sets the default activity types to be traced
+  void selectDefaultActivityTypes() {
+    // If the user has not specified an activity list, add all types
+    for (ActivityType t : activityTypes()) {
+      // Do not enable this by default
+      // TODO: introduce optional types
+      if (t != ActivityType::OVERHEAD) {
+        selectedActivityTypes_.insert(t);
+      }
+    }
+  }
+
+  int verboseLogLevel_;
+  std::vector<std::string> verboseLogModules_;
+
+  // Event profiler
+  // These settings are also supported in on-demand mode
+  std::chrono::milliseconds samplePeriod_;
+  std::chrono::milliseconds reportPeriod_;
+  int samplesPerReport_;
+  std::set<std::string> eventNames_;
+  std::set<std::string> metricNames_;
+
+  // On-demand duration
+  std::chrono::seconds eventProfilerOnDemandDuration_;
+  // Last on-demand request
+  std::chrono::time_point<std::chrono::system_clock>
+      eventProfilerOnDemandTimestamp_;
+
+  int eventProfilerMaxInstancesPerGpu_;
+
+  // Monitor whether event profiler threads are stuck
+  // at this frequency
+  std::chrono::seconds eventProfilerHeartbeatMonitorPeriod_;
+
+  // These settings can not be changed on-demand
+  std::string eventLogFile_;
+  std::vector<double> eventReportPercentiles_ = {5, 25, 50, 75, 95};
+  uint8_t eventProfilerDeviceMask_ = ~0;
+  std::chrono::milliseconds multiplexPeriod_;
+
+  // Activity profiler
+  bool activityProfilerEnabled_;
+  std::set<ActivityType> selectedActivityTypes_;
+
+  // The activity profiler settings are all on-demand
+  std::string activitiesLogFile_;
+
+  std::string activitiesLogUrl_;
+
+  // Log activities to memory buffer
+  bool activitiesLogToMemory_{false};
+
+  int activitiesMaxGpuBufferSize_;
+  std::chrono::seconds activitiesWarmupDuration_;
+  int activitiesWarmupIterations_;
+
+  // Client Interface
+  // Enable inputs collection when tracing ops
+  bool enableOpInputsCollection_{true};
+
+  // Profile for specified iterations and duration
+  std::chrono::milliseconds activitiesDuration_;
+  int activitiesRunIterations_;
+
+  // Below are not used
+  // Use this net name for iteration count
+  std::string activitiesExternalAPIIterationsTarget_;
+  // Only profile nets that include this in the name
+  std::vector<std::string> activitiesExternalAPIFilter_;
+  // Only profile nets with at least this many operators
+  int activitiesExternalAPINetSizeThreshold_;
+  // Only profile nets with at least this many GPU operators
+  int activitiesExternalAPIGpuOpCountThreshold_;
+  // Last activity profiler request
+  std::chrono::time_point<std::chrono::system_clock>
+      activitiesOnDemandTimestamp_;
+
+  // Synchronized start timestamp
+  std::chrono::time_point<std::chrono::system_clock> profileStartTime_;
+  // or start iteration
+  int profileStartIteration_;
+  int profileStartIterationRoundUp_;
+
+  // DEPRECATED
+  std::chrono::time_point<std::chrono::system_clock> requestTimestamp_;
+
+  // Enable profiling via SIGUSR2
+  bool enableSigUsr2_;
+
+  // Enable IPC Fabric instead of thrift communication
+  bool enableIpcFabric_;
+
+  // Logger Metadata
+  std::string requestTraceID_;
+  std::string requestGroupTraceID_;
+};
+
+} // namespace KINETO_NAMESPACE
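One detail worth calling out in `Config` above is the `alignUp()` arithmetic: it first adds the alignment and then rounds down, so a value already on a boundary is pushed to the next multiple rather than left unchanged. A standalone sketch (mirroring the method body verbatim) makes the behavior concrete:

```
#include <cassert>
#include <chrono>

using std::chrono::milliseconds;

// Same arithmetic as Config::alignUp() above.
static milliseconds alignUp(milliseconds duration, milliseconds alignment) {
  duration += alignment;
  return duration - (duration % alignment);
}

int main() {
  // 1250ms rounds up to the next 500ms boundary.
  assert(alignUp(milliseconds(1250), milliseconds(500)) == milliseconds(1500));
  // An exact multiple is pushed one full alignment further.
  assert(alignUp(milliseconds(1000), milliseconds(500)) == milliseconds(1500));
}
```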
diff --git a/tb_plugins/profiling/libkineto/include/GenericTraceActivity.h b/tb_plugins/profiling/libkineto/include/GenericTraceActivity.h new file mode 100644 index 0000000000000000000000000000000000000000..4272cf1efa4e7613a46c3684270b4e803853345b --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/GenericTraceActivity.h @@ -0,0 +1,125 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <cstdint>
+#include <fmt/format.h>
+#include <string>
+#include <vector>
+
+#include "ThreadUtil.h"
+#include "ITraceActivity.h"
+#include "TraceSpan.h"
+
+namespace libkineto {
+
+// Link type, used in GenericTraceActivity.flow.type
+constexpr unsigned int kLinkFwdBwd = 1;
+constexpr unsigned int kLinkAsyncCpuGpu = 2;
+
+// @lint-ignore-every CLANGTIDY cppcoreguidelines-non-private-member-variables-in-classes
+// @lint-ignore-every CLANGTIDY cppcoreguidelines-pro-type-member-init
+class GenericTraceActivity : public ITraceActivity {
+
+ public:
+  GenericTraceActivity() : activityType(ActivityType::ENUM_COUNT), traceSpan_(NULL) {}
+
+  GenericTraceActivity(
+      const TraceSpan& trace, ActivityType type, const std::string& name)
+      : activityType(type), activityName(name), traceSpan_(&trace) {
+  }
+
+  int64_t deviceId() const override {
+    return device;
+  }
+
+  int64_t resourceId() const override {
+    return resource;
+  }
+
+  int32_t getThreadId() const override {
+    return threadId;
+  }
+
+  int64_t timestamp() const override {
+    return startTime;
+  }
+
+  int64_t duration() const override {
+    return endTime - startTime;
+  }
+
+  int64_t correlationId() const override {
+    return id;
+  }
+
+  ActivityType type() const override {
+    return activityType;
+  }
+
+  const ITraceActivity* linkedActivity() const override {
+    return nullptr;
+  }
+
+  int flowType() const override {
+    return flow.type;
+  }
+
+  int flowId() const override {
+    return flow.id;
+  }
+
+  bool flowStart() const override {
+    return flow.start;
+  }
+
+  const std::string name() const override {
+    return activityName;
+  }
+
+  const TraceSpan* traceSpan() const override {
+    return traceSpan_;
+  }
+
+  void log(ActivityLogger& logger) const override;
+
+  // Encode client-side metadata as a key/value pair
+  template <typename ValType>
+  void addMetadata(const std::string& key, const ValType& value) {
+    metadata_.push_back(fmt::format("\"{}\": {}", key, value));
+  }
+
+  void addMetadataQuoted(const std::string& key, const std::string& value) {
+    metadata_.push_back(fmt::format("\"{}\": \"{}\"", key, value));
+  }
+
+  const std::string metadataJson() const override {
+    return fmt::format("{}", fmt::join(metadata_, ", "));
+  }
+
+  virtual ~GenericTraceActivity() {};
+
+  int64_t startTime{0};
+  int64_t endTime{0};
+  int32_t id{0};
+  int32_t device{0};
+  int32_t resource{0};
+  int32_t threadId{0};
+  ActivityType activityType;
+  std::string activityName;
+  struct Flow {
+    Flow(): id(0), type(0), start(0) {}
+    // Ids must be unique within each type
+    uint32_t id : 27;
+    // Type will be used to connect flows between profilers, as
+    // well as look up flow information (name etc)
+    uint32_t type : 4;
+    uint32_t start : 1;
+  } flow;
+
+ private:
+  const TraceSpan* traceSpan_;
+  std::vector<std::string> metadata_;
+};
+
+} // namespace libkineto
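As a sketch of how a client-side event might be populated through the public fields and metadata helpers above (all concrete values here are illustrative; `log()` needs an `ActivityLogger`, so it is not called):

```
#include "GenericTraceActivity.h"

libkineto::GenericTraceActivity makeUserAnnotation(
    const libkineto::TraceSpan& span, int64_t startUs, int64_t endUs) {
  libkineto::GenericTraceActivity act(
      span, libkineto::ActivityType::USER_ANNOTATION, "my_region");
  act.startTime = startUs;
  act.endTime = endUs;
  act.device = libkineto::processId();        // ThreadUtil.h, included above
  act.resource = libkineto::systemThreadId();
  // Serialized by metadataJson() as: "batch_size": 32, "phase": "forward"
  act.addMetadata("batch_size", 32);          // raw JSON value
  act.addMetadataQuoted("phase", "forward");  // quoted string value
  return act;
}
```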
diff --git a/tb_plugins/profiling/libkineto/include/IActivityProfiler.h b/tb_plugins/profiling/libkineto/include/IActivityProfiler.h new file mode 100644 index 0000000000000000000000000000000000000000..f5d4b3fb828a3348d948c6487acc6a9e5a18f836 --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/IActivityProfiler.h @@ -0,0 +1,104 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "Config.h"
+#include "GenericTraceActivity.h"
+
+/* This file includes an abstract base class for an activity profiler
+ * that can be implemented by multiple tracing agents in the application.
+ * The high level Kineto profiler can coordinate start and end of tracing
+ * and combine together events from multiple such activity profilers.
+ */
+
+namespace libkineto {
+
+using namespace KINETO_NAMESPACE;
+
+#ifdef _MSC_VER
+// workaround for the predefined ERROR macro on Windows
+#undef ERROR
+#endif // _MSC_VER
+
+enum class TraceStatus {
+  READY, // Accepting trace requests
+  WARMUP, // Performing trace warmup
+  RECORDING, // Actively collecting activities
+  PROCESSING, // Recording is complete, preparing results
+  ERROR, // One or more errors (and possibly also warnings) occurred.
+  WARNING, // One or more warnings occurred.
+};
+
+/* IActivityProfilerSession:
+ *   an opaque object that can be used by a high level profiler to
+ *   start/stop and return trace events.
+ */
+class IActivityProfilerSession {
+
+ public:
+  virtual ~IActivityProfilerSession() {}
+
+  // start the trace collection synchronously
+  virtual void start() = 0;
+
+  // stop the trace collection synchronously
+  virtual void stop() = 0;
+
+  TraceStatus status() {
+    return status_;
+  }
+
+  // returns list of Trace Activities
+  virtual std::vector<GenericTraceActivity>& activities() = 0;
+
+  // returns errors with this trace
+  virtual std::vector<std::string> errors() = 0;
+
+  // processes trace activities using logger
+  virtual void processTrace(ActivityLogger& logger) = 0;
+
+  // XXX define trace formats
+  // virtual save(string name, TraceFormat format)
+
+ protected:
+  TraceStatus status_ = TraceStatus::READY;
+};
+
+
+/* Activity Profiler Plugins:
+ *   These allow other frameworks to integrate into Kineto's primary
+ *   activity profiler. While the primary activity profiler handles
+ *   timing the trace collections and correlating events, the plugins
+ *   can become sources of new trace activity types.
+ */
+class IActivityProfiler {
+
+ public:
+
+  virtual ~IActivityProfiler() {}
+
+  // name of profiler
+  virtual const std::string& name() const = 0;
+
+  // returns activity types this profiler supports
+  virtual const std::set<ActivityType>& availableActivities() const = 0;
+
+  // Calls prepare() on registered tracer providers passing in the relevant
+  // activity types. Returns a profiler session handle.
+  virtual std::unique_ptr<IActivityProfilerSession> configure(
+      const std::set<ActivityType>& activity_types,
+      const Config& config) = 0;
+
+  // Asynchronous version of the above with future timestamp and duration.
+  virtual std::unique_ptr<IActivityProfilerSession> configure(
+      int64_t ts_ms,
+      int64_t duration_ms,
+      const std::set<ActivityType>& activity_types,
+      const Config& config) = 0;
+};
+
+} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/include/ILoggerObserver.h b/tb_plugins/profiling/libkineto/include/ILoggerObserver.h new file mode 100644 index 0000000000000000000000000000000000000000..4fce7851b9669ff93a3f3a772140b0466674853c --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/ILoggerObserver.h @@ -0,0 +1,50 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <string>
+
+// Stages in libkineto used when pushing logs to UST Logger.
+constexpr char kWarmUpStage[] = "Warm Up";
+constexpr char kCollectionStage[] = "Collection";
+constexpr char kPostProcessingStage[] = "Post Processing";
+
+#if !USE_GOOGLE_LOG
+
+#include <map>
+#include <vector>
+
+namespace libkineto {
+
+enum LoggerOutputType {
+  VERBOSE = 0,
+  INFO = 1,
+  WARNING = 2,
+  ERROR = 3,
+  STAGE = 4,
+  ENUM_COUNT = 5
+};
+
+const char* toString(LoggerOutputType t);
+LoggerOutputType toLoggerOutputType(const std::string& str);
+
+constexpr int LoggerTypeCount = (int) LoggerOutputType::ENUM_COUNT;
+
+class ILoggerObserver {
+ public:
+  virtual ~ILoggerObserver() = default;
+  virtual void write(const std::string& message, LoggerOutputType ot) = 0;
+  virtual const std::map<std::string, std::vector<std::string>> extractCollectorMetadata() = 0;
+  virtual void reset() = 0;
+  virtual void addDevice(const int64_t device) = 0;
+  virtual void setTraceDurationMS(const int64_t duration) = 0;
+  virtual void addEventCount(const int64_t count) = 0;
+  virtual void setTraceID(const std::string&) {}
+  virtual void setGroupTraceID(const std::string&) {}
+  virtual void addDestination(const std::string& dest) = 0;
+
+};
+
+} // namespace libkineto
+
+#endif // !USE_GOOGLE_LOG
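Since `ILoggerObserver` is pure virtual, a concrete observer is needed to receive log lines. Below is a minimal sketch under the `!USE_GOOGLE_LOG` build configuration above, with the metadata-collection hooks stubbed out (the class name is hypothetical):

```
#include <map>
#include <string>
#include <vector>

#include "ILoggerObserver.h"

class CountingObserver : public libkineto::ILoggerObserver {
 public:
  void write(const std::string& /*message*/,
             libkineto::LoggerOutputType ot) override {
    counts_[ot]++;  // tally messages per output type
  }
  const std::map<std::string, std::vector<std::string>>
  extractCollectorMetadata() override {
    return {};  // nothing collected in this sketch
  }
  void reset() override { counts_.clear(); }
  void addDevice(const int64_t) override {}
  void setTraceDurationMS(const int64_t) override {}
  void addEventCount(const int64_t) override {}
  void addDestination(const std::string&) override {}

 private:
  std::map<libkineto::LoggerOutputType, int> counts_;
};
```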
diff --git a/tb_plugins/profiling/libkineto/include/ITraceActivity.h b/tb_plugins/profiling/libkineto/include/ITraceActivity.h new file mode 100644 index 0000000000000000000000000000000000000000..a477ed814662cb4c57738b7e40ec6052e9f65288 --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/ITraceActivity.h @@ -0,0 +1,53 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <string>
+
+#include "ActivityType.h"
+
+namespace libkineto {
+
+class ActivityLogger;
+struct TraceSpan;
+
+// Generic activity interface is borrowed from tensorboard protobuf format.
+struct ITraceActivity {
+  virtual ~ITraceActivity() {}
+  // Device is a physical or logical entity, e.g. CPU, GPU or process
+  virtual int64_t deviceId() const = 0;
+  // A resource is something on the device, h/w thread,
+  // functional units etc.
+  virtual int64_t resourceId() const = 0;
+  // s/w thread
+  virtual int32_t getThreadId() const = 0;
+  // Start timestamp in microseconds
+  virtual int64_t timestamp() const = 0;
+  // Duration in microseconds
+  virtual int64_t duration() const = 0;
+  // Used to link up async activities
+  virtual int64_t correlationId() const = 0;
+  // Part of a flow, identified by flow id and type
+  virtual int flowType() const = 0;
+  virtual int flowId() const = 0;
+  virtual bool flowStart() const = 0;
+  virtual ActivityType type() const = 0;
+  virtual const std::string name() const = 0;
+  // Optional linked activity
+  virtual const ITraceActivity* linkedActivity() const = 0;
+  // Optional containing trace object
+  virtual const TraceSpan* traceSpan() const = 0;
+  // Log activity
+  virtual void log(ActivityLogger& logger) const = 0;
+  // Return json formatted metadata
+  // FIXME: Return iterator to dynamic type map here instead
+  virtual const std::string metadataJson() const = 0;
+
+  static int64_t nsToUs(int64_t ns) {
+    // It's important that this conversion is the same everywhere.
+    // No rounding!
+    return ns / 1000;
+  }
+};
+
+} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/include/ThreadUtil.h b/tb_plugins/profiling/libkineto/include/ThreadUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..d1dc80ad2ab0dfd3bea313363fb0e6565349889c --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/ThreadUtil.h @@ -0,0 +1,22 @@
+#pragma once
+
+#include <stdint.h>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace libkineto {
+
+int32_t systemThreadId();
+int32_t threadId();
+bool setThreadName(const std::string& name);
+std::string getThreadName();
+
+int32_t processId();
+std::string processName(int32_t pid);
+
+// Return a list of pids and process names for the current process
+// and its parents.
+std::vector<std::pair<int32_t, std::string>> pidCommandPairsOfAncestors();
+
+} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/include/TraceSpan.h b/tb_plugins/profiling/libkineto/include/TraceSpan.h new file mode 100644 index 0000000000000000000000000000000000000000..af9a9d5ee556830ac34568e6c81ec4f8f00da2e3 --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/TraceSpan.h @@ -0,0 +1,36 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <stdint.h>
+#include <string>
+#include <thread>
+
+namespace libkineto {
+
+struct TraceSpan {
+  TraceSpan() = delete;
+  TraceSpan(
+      int64_t startTime, int64_t endTime, std::string name)
+      : startTime(startTime), endTime(endTime), name(std::move(name)) {
+  }
+  TraceSpan(
+      int opCount, int it, std::string name, std::string prefix)
+      : opCount(opCount),
+        iteration(it),
+        name(std::move(name)),
+        prefix(std::move(prefix)) {
+  }
+
+  // FIXME: change to duration?
+  int64_t startTime{0};
+  int64_t endTime{0};
+  int opCount{0};
+  int iteration{-1};
+  // Name is used to identify timeline
+  std::string name;
+  // Prefix used to distinguish trace spans on the same timeline
+  std::string prefix;
+};
+
+} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/include/libkineto.h b/tb_plugins/profiling/libkineto/include/libkineto.h new file mode 100644 index 0000000000000000000000000000000000000000..87c3d64f638dad9d1c2d24c013135db60d477642 --- /dev/null +++ b/tb_plugins/profiling/libkineto/include/libkineto.h @@ -0,0 +1,138 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+// Mediator for initialization and profiler control
+
+#pragma once
+
+#include <atomic>
+#include <chrono>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "ActivityProfilerInterface.h"
+#include "ActivityType.h"
+#include "ClientInterface.h"
+#include "GenericTraceActivity.h"
+#include "TraceSpan.h"
+#include "IActivityProfiler.h"
+#include "ActivityTraceInterface.h"
+
+#include "ThreadUtil.h"
+
+extern "C" {
+  void suppressLibkinetoLogMessages();
+  int InitializeInjection(void);
+  bool libkineto_init(bool cpuOnly, bool logOnError);
+}
+
+namespace libkineto {
+
+class Config;
+class ConfigLoader;
+
+struct CpuTraceBuffer {
+  TraceSpan span{0, 0, "none"};
+  int gpuOpCount;
+  std::deque<GenericTraceActivity> activities;
+};
+
+using ChildActivityProfilerFactory =
+  std::function<std::unique_ptr<IActivityProfiler>()>;
+
+class LibkinetoApi {
+ public:
+
+  explicit LibkinetoApi(ConfigLoader& configLoader)
+      : configLoader_(configLoader) {
+  }
+
+  // Called by client that supports tracing API.
+  // libkineto can still function without this.
+  void registerClient(ClientInterface* client);
+
+  // Called by libkineto on init
+  void registerProfiler(std::unique_ptr<ActivityProfilerInterface> profiler) {
+    activityProfiler_ = std::move(profiler);
+    initClientIfRegistered();
+  }
+
+  ActivityProfilerInterface& activityProfiler() {
+    return *activityProfiler_;
+  }
+
+  ClientInterface* client() {
+    return client_;
+  }
+
+  void initProfilerIfRegistered() {
+    static std::once_flag once;
+    if (activityProfiler_) {
+      std::call_once(once, [this] {
+        if (!activityProfiler_->isInitialized()) {
+          activityProfiler_->init();
+          initChildActivityProfilers();
+        }
+      });
+    }
+  }
+
+  bool isProfilerInitialized() const {
+    return activityProfiler_ && activityProfiler_->isInitialized();
+  }
+
+  bool isProfilerRegistered() const {
+    return activityProfiler_ != nullptr;
+  }
+
+  void suppressLogMessages() {
+    suppressLibkinetoLogMessages();
+  }
+
+  // Provides access to profiler configuration management
+  ConfigLoader& configLoader() {
+    return configLoader_;
+  }
+
+  void registerProfilerFactory(
+      ChildActivityProfilerFactory factory) {
+    if (isProfilerInitialized()) {
+      activityProfiler_->addChildActivityProfiler(factory());
+    } else {
+      childProfilerFactories_.push_back(factory);
+    }
+  }
+
+ private:
+
+  void initChildActivityProfilers() {
+    if (!isProfilerInitialized()) {
+      return;
+    }
+    for (const auto& factory : childProfilerFactories_) {
+      activityProfiler_->addChildActivityProfiler(factory());
+    }
+    childProfilerFactories_.clear();
+  }
+
+  // Client is initialized once both it and libkineto have registered
+  void initClientIfRegistered();
+
+  ConfigLoader& configLoader_;
+  std::unique_ptr<ActivityProfilerInterface> activityProfiler_{};
+  ClientInterface* client_{};
+  int32_t clientRegisterThread_{0};
+
+  bool isLoaded_{false};
+  std::vector<ChildActivityProfilerFactory> childProfilerFactories_;
+};
+
+// Singleton
+LibkinetoApi& api();
+
+} // namespace libkineto
diff --git a/tb_plugins/profiling/libkineto/include/time_since_epoch.h b/tb_plugins/profiling/libkineto/include/time_since_epoch.h
new file mode 100644
index 0000000000000000000000000000000000000000..caa6b4d92760d384eca2b1383a679fe7435c53b3
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/include/time_since_epoch.h
@@ -0,0 +1,16 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
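Stepping back from libkineto.h above: a sketch of the initialization handshake LibkinetoApi mediates, using only methods declared in that header. Illustrative only, not part of this patch; it assumes a profiler implementation has already called registerProfiler() during library load.

```cpp
#include "libkineto.h"

void ensureProfilerReady() {
  auto& api = libkineto::api();
  if (api.isProfilerRegistered()) {
    // Idempotent: guarded internally by std::call_once.
    api.initProfilerIfRegistered();
  }
}
```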
+
+#pragma once
+
+#include <chrono>
+
+namespace libkineto {
+
+inline int64_t timeSinceEpoch(
+    const std::chrono::time_point<std::chrono::system_clock>& t) {
+  return std::chrono::duration_cast<std::chrono::milliseconds>(
+             t.time_since_epoch())
+      .count();
+}
+
+} // namespace libkineto
diff --git a/tb_plugins/profiling/libkineto/libkineto_defs.bzl b/tb_plugins/profiling/libkineto/libkineto_defs.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..330c54a22dfcedf895f0eba4077713a7c4cd8072
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/libkineto_defs.bzl
@@ -0,0 +1,76 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+def get_libkineto_api_srcs():
+    return [
+        "src/ThreadUtil.cpp",
+        "src/libkineto_api.cpp",
+    ]
+
+def get_libkineto_cupti_srcs(with_api = True):
+    return [
+        "src/CudaDeviceProperties.cpp",
+        "src/CuptiActivityApi.cpp",
+        "src/CuptiActivityPlatform.cpp",
+        "src/CuptiCallbackApi.cpp",
+        "src/CuptiEventApi.cpp",
+        "src/CuptiMetricApi.cpp",
+        "src/CuptiRangeProfilerApi.cpp",
+        "src/Demangle.cpp",
+        "src/EventProfiler.cpp",
+        "src/EventProfilerController.cpp",
+        "src/WeakSymbols.cpp",
+        "src/cupti_strings.cpp",
+    ] + (get_libkineto_cpu_only_srcs(with_api))
+
+def get_libkineto_roctracer_srcs(with_api = True):
+    return [
+        "src/RoctracerActivityApi.cpp",
+    ] + (get_libkineto_cpu_only_srcs(with_api))
+
+def get_libkineto_cpu_only_srcs(with_api = True):
+    return [
+        "src/AbstractConfig.cpp",
+        "src/CuptiActivityProfiler.cpp",
+        "src/ActivityProfilerController.cpp",
+        "src/ActivityProfilerProxy.cpp",
+        "src/ActivityType.cpp",
+        "src/Config.cpp",
+        "src/ConfigLoader.cpp",
+        "src/CuptiActivityApi.cpp",
+        "src/Demangle.cpp",
+        "src/GenericTraceActivity.cpp",
+        "src/ILoggerObserver.cpp",
+        "src/Logger.cpp",
+        "src/init.cpp",
+        "src/output_csv.cpp",
+        "src/output_json.cpp",
+    ] + (get_libkineto_api_srcs() if with_api else [])
+
+def get_libkineto_public_headers():
+    return [
+        "include/AbstractConfig.h",
+        "include/ActivityProfilerInterface.h",
+        "include/ActivityType.h",
+        "include/Config.h",
+        "include/ClientInterface.h",
+        "include/GenericTraceActivity.h",
+        "include/IActivityProfiler.h",
+        "include/ILoggerObserver.h",
+        "include/ITraceActivity.h",
+        "include/TraceSpan.h",
+        "include/ThreadUtil.h",
+        "include/libkineto.h",
+        "include/time_since_epoch.h",
+    ]
+
+# kineto code should be updated to not have to
+# suppress these warnings.
+KINETO_COMPILER_FLAGS = [
+    "-fexceptions",
+    "-Wno-deprecated-declarations",
+    "-Wno-unused-function",
+    "-Wno-unused-private-field",
+]
diff --git a/tb_plugins/profiling/libkineto/sample_programs/kineto_playground.cpp b/tb_plugins/profiling/libkineto/sample_programs/kineto_playground.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..780047912ed09996d3952901267d46aab99cf78c
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/sample_programs/kineto_playground.cpp
@@ -0,0 +1,38 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
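timeSinceEpoch() above pairs with the millisecond-since-epoch convention used by the PROFILE_START_TIME config key later in this patch. A sketch, illustrative only and assuming the millisecond cast shown above:

```cpp
#include <chrono>
#include "time_since_epoch.h"

// Compute a PROFILE_START_TIME-style value five seconds in the future.
int64_t startTimeMs() {
  auto t = std::chrono::system_clock::now() + std::chrono::seconds(5);
  return libkineto::timeSinceEpoch(t);
}
```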
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <string>
+#include <libkineto.h>
+
+#include "kineto/libkineto/sample_programs/kineto_playground.cuh"
+
+using namespace kineto;
+
+static const std::string kFileName = "/tmp/kineto_playground_trace.json";
+
+int main() {
+  warmup();
+
+  // Kineto config
+
+  // Empty types set defaults to all types
+  std::set<libkineto::ActivityType> types;
+
+  auto& profiler = libkineto::api().activityProfiler();
+  libkineto::api().initProfilerIfRegistered();
+  profiler.prepareTrace(types);
+
+  // Good to warm up after prepareTrace to get cupti initialization to settle
+  warmup();
+  profiler.startTrace();
+  playground();
+
+  auto trace = profiler.stopTrace();
+  LOG(INFO) << "Stopped and processed trace. Got " << trace->activities()->size() << " activities.";
+  trace->save(kFileName);
+  return 0;
+}
+
diff --git a/tb_plugins/profiling/libkineto/sample_programs/kineto_playground.cu b/tb_plugins/profiling/libkineto/sample_programs/kineto_playground.cu
new file mode 100644
index 0000000000000000000000000000000000000000..54c6f82ff4be2e468c0e868b49b3a9130de97490
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/sample_programs/kineto_playground.cu
@@ -0,0 +1,60 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <stdio.h>
+
+#include "kineto_playground.cuh"
+
+
+namespace kineto {
+
+void warmup(void) {
+  // Initializing CUDA can take a while, which we normally do not want to see
+  // in Kineto traces. Frameworks that depend on Kineto do this in various
+  // ways; this is our way of doing warmup for kineto_playground.
+  size_t bytes = 1000;
+  float* mem = NULL;
+  auto error = cudaMalloc(&mem, bytes);
+  if (error != cudaSuccess) {
+    printf("cudaMalloc failed during kineto_playground warmup. error code: %d", error);
+    return;
+  }
+
+  cudaFree(mem);
+}
+
+void basicMemcpyMemset(void) {
+  size_t size = (1 << 8) * sizeof(float);
+  float *hostMemSrc, *deviceMem, *hostMemDst;
+  cudaError_t err;
+
+  hostMemSrc = (float*)malloc(size);
+  hostMemDst = (float*)malloc(size);
+  err = cudaMalloc(&deviceMem, size);
+  if (err != cudaSuccess) {
+    printf("cudaMalloc failed during %s", __func__);
+    return;
+  }
+
+  memset(hostMemSrc, 1, size);
+  err = cudaMemcpy(deviceMem, hostMemSrc, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    printf("cudaMemcpy failed during %s", __func__);
+    return;
+  }
+
+  err = cudaMemcpy(hostMemDst, deviceMem, size, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess) {
+    printf("cudaMemcpy failed during %s", __func__);
+    return;
+  }
+
+  free(hostMemSrc);
+  free(hostMemDst);
+  cudaFree(deviceMem);
+}
+
+void playground(void) {
+  // Add your experimental CUDA implementation here.
+}
+
+} // namespace kineto
diff --git a/tb_plugins/profiling/libkineto/sample_programs/kineto_playground.cuh b/tb_plugins/profiling/libkineto/sample_programs/kineto_playground.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..54e1ee59ada9ae88370b38146567ed87be2b914b
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/sample_programs/kineto_playground.cuh
@@ -0,0 +1,18 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <stdint.h>
+
+namespace kineto {
+
+// Warms up CUDA before the tracing starts
+void warmup(void);
+
+// Basic usage of cudaMemcpy and cudaMemset
+void basicMemcpyMemset(void);
+
+// Your experimental code goes in here!
+void playground(void);
+
+}
diff --git a/tb_plugins/profiling/libkineto/src/AbstractConfig.cpp b/tb_plugins/profiling/libkineto/src/AbstractConfig.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d60ab43c9a3e198167beb7987d619b0bb8e9ed13
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/AbstractConfig.cpp
@@ -0,0 +1,188 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include "AbstractConfig.h"
+
+#include <fmt/format.h>
+#include <array>
+#include <sstream>
+
+#include "Logger.h"
+
+using namespace std::chrono;
+
+using std::string;
+using std::vector;
+
+namespace KINETO_NAMESPACE {
+
+constexpr char kWhitespace[] = "\t\n ";
+
+static bool isWhitespace(string& s) {
+  return s.find_first_not_of(kWhitespace) == string::npos;
+}
+
+// Remove whitespace from both ends of a string
+static inline string trim(string& s) {
+  if (s.empty()) {
+    return s;
+  } else if (isWhitespace(s)) {
+    return "";
+  }
+  auto start = s.find_first_not_of(kWhitespace);
+  auto end = s.find_last_not_of(kWhitespace);
+  return s.substr(start, end - start + 1);
+}
+
+// Helper function for split.
+// Return the index of char delim in string s.
+// If not found, returns the length of the string.
+static int find(const char* s, char delim) {
+  int i;
+  for (i = 0; s[i]; i++) {
+    if (s[i] == delim) {
+      break;
+    }
+  }
+  return i;
+}
+
+// Split a string by delimiter
+static vector<string> split(const string& s, char delim) {
+  vector<string> res;
+  const char* cs = s.c_str();
+  for (int i = find(cs, delim); cs[i]; cs += i + 1, i = find(cs, delim)) {
+    res.emplace_back(cs, i);
+  }
+  res.emplace_back(cs);
+  return res;
+}
+
+// Remove a trailing comment.
+static inline string stripComment(const string& s) {
+  std::size_t pos = s.find("#");
+  return s.substr(0, pos);
+}
+
+string AbstractConfig::toLower(string& s) const {
+  string res = s;
+  for (int i = 0; i < res.size(); i++) {
+    if (res[i] >= 'A' && res[i] <= 'Z') {
+      res[i] += ('a' - 'A');
+    }
+  }
+  return res;
+}
+
+bool AbstractConfig::endsWith(const string& s, const string& suffix) const {
+  if (suffix.size() > s.size()) {
+    return false;
+  }
+  return s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
+}
+
+vector<string> AbstractConfig::splitAndTrim(const string& s, char delim) const {
+  auto res = split(s, delim);
+  for (string& x : res) {
+    x = trim(x);
+  }
+  return res;
+}
+
+int64_t AbstractConfig::toIntRange(const string& val, int64_t min, int64_t max)
+    const {
+  char* invalid;
+  int64_t res = strtoll(val.c_str(), &invalid, 10);
+  if (val.empty() || *invalid) {
+    throw std::invalid_argument(fmt::format("Invalid integer: {}", val));
+  } else if (res < min || res > max) {
+    throw std::invalid_argument(fmt::format(
+        "Invalid argument: {} - expected range [{}, {}]", res, min, max));
+  }
+  return res;
+}
+
+int32_t AbstractConfig::toInt32(const string& val) const {
+  return toIntRange(val, 0, ~0u / 2);
+}
+
+int64_t AbstractConfig::toInt64(const string& val) const {
+  return toIntRange(val, 0, ~0ul / 2);
+}
+
+bool AbstractConfig::toBool(string& val) const {
+  const std::array<const char*, 8> bool_vals{
+      "n", "y", "no", "yes", "f", "t", "false", "true"};
+  const string lower_val = toLower(val);
+  for (int i = 0; i < bool_vals.size(); i++) {
+    if (lower_val == bool_vals[i]) {
+      return i % 2;
+    }
+  }
+  throw std::invalid_argument(fmt::format("Invalid bool argument: {}", val));
+  return false;
+}
+
+bool AbstractConfig::parse(const string& conf) {
+  std::istringstream iss(conf);
+  string line;
+
+  timestamp_ = system_clock::now();
+
+  // Read the string stream 1 line at a time to parse.
+  while (std::getline(iss, line)) {
+    line = stripComment(line);
+    if (isWhitespace(line)) {
+      continue;
+    }
+    vector<string> key_val = splitAndTrim(line, '=');
+    if (key_val.size() != 2) {
+      LOG(ERROR) << "Invalid config line: " << line;
+      return false;
+    } else {
+      bool handled = false;
+      try {
+        handled = handleOption(key_val[0], key_val[1]);
+        if (!handled) {
+          for (auto& feature_cfg : featureConfigs_) {
+            if (feature_cfg.second->handleOption(key_val[0], key_val[1])) {
+              handled = true;
+              break;
+            }
+          }
+        }
+      } catch (const std::exception& e) {
+        LOG(ERROR) << "Failed to parse config line: " << line;
+        LOG(ERROR) << e.what();
+        return false;
+      }
+      if (!handled) {
+        // This might be due to using a newer config option on an
+        // older binary where it is not supported. In this case,
+        // print a warning message - but it is expected to work!
+        LOG(WARNING) << "Unrecognized config line: " << line;
+      }
+    }
+  }
+
+  validate(timestamp_);
+
+  // Store original text, used to detect updates
+  source_ = conf;
+  timestamp_ = system_clock::now();
+  return true;
+}
+
+bool AbstractConfig::handleOption(
+    const std::string& /* unused */,
+    std::string& /* unused */) {
+  LOG(ERROR) << "handleOption unimplemented";
+  return false;
+}
+
+void AbstractConfig::printActivityProfilerConfig(std::ostream& s) const {
+  for (const auto& feature_cfg : featureConfigs_) {
+    feature_cfg.second->printActivityProfilerConfig(s);
+  }
+}
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/ActivityBuffers.h b/tb_plugins/profiling/libkineto/src/ActivityBuffers.h
new file mode 100644
index 0000000000000000000000000000000000000000..157af879379a5f5fc5e274f22604987a97f17af4
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/ActivityBuffers.h
@@ -0,0 +1,29 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+
+#include <list>
+#include <memory>
+
+#include "libkineto.h"
+#include "CuptiActivityBuffer.h"
+
+namespace KINETO_NAMESPACE {
+
+struct ActivityBuffers {
+  std::list<std::unique_ptr<libkineto::CpuTraceBuffer>> cpu;
+  std::unique_ptr<CuptiActivityBufferMap> gpu;
+
+  // Add a wrapper object to the underlying struct stored in the buffer
+  template<class T>
+  const ITraceActivity& addActivityWrapper(const T& act) {
+    wrappers_.push_back(std::make_unique<T>(act));
+    return *wrappers_.back().get();
+  }
+
+ private:
+  std::vector<std::unique_ptr<const ITraceActivity>> wrappers_;
+};
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/ActivityLoggerFactory.h b/tb_plugins/profiling/libkineto/src/ActivityLoggerFactory.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d1bf642cd68051e487004d33e19c5eb181e1c41
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/ActivityLoggerFactory.h
@@ -0,0 +1,60 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
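One note on ActivityBuffers above: addActivityWrapper() copies the activity into an owned wrapper so the returned ITraceActivity reference stays valid for the buffer's lifetime. A sketch, illustrative only and not part of this patch:

```cpp
#include "ActivityBuffers.h"
#include "GenericTraceActivity.h"

const libkineto::ITraceActivity& keepAlive(
    KINETO_NAMESPACE::ActivityBuffers& bufs,
    const libkineto::GenericTraceActivity& act) {
  // The reference remains valid for as long as `bufs` lives.
  return bufs.addActivityWrapper(act);
}
```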
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace KINETO_NAMESPACE { + +class ActivityLogger; + +class ActivityLoggerFactory { + + public: + using FactoryFunc = + std::function(const std::string& url)>; + + // Add logger factory for a protocol prefix + void addProtocol(const std::string& protocol, FactoryFunc f) { + factories_[tolower(protocol)] = f; + } + + // Create a logger, invoking the factory for the protocol specified in url + std::unique_ptr makeLogger(const std::string& url) const { + std::string protocol = extractProtocol(url); + auto it = factories_.find(tolower(protocol)); + if (it != factories_.end()) { + return it->second(stripProtocol(url)); + } + throw std::invalid_argument(fmt::format( + "No logger registered for the {} protocol prefix", + protocol)); + return nullptr; + } + + private: + static std::string tolower(std::string s) { + std::transform(s.begin(), s.end(), s.begin(), + [](unsigned char c) { return std::tolower(c); } + ); + return s; + } + + static std::string extractProtocol(std::string url) { + return url.substr(0, url.find("://")); + } + + static std::string stripProtocol(std::string url) { + size_t pos = url.find("://"); + return pos == url.npos ? url : url.substr(pos + 3); + } + + std::map factories_; +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/ActivityProfilerController.cpp b/tb_plugins/profiling/libkineto/src/ActivityProfilerController.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c85d41ed73ff059bcd7ee69c36a0bcc6c3d5c4ca --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/ActivityProfilerController.cpp @@ -0,0 +1,246 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "ActivityProfilerController.h" + +#include +#include + +#include "ActivityLoggerFactory.h" +#include "ActivityTrace.h" +#include "CuptiActivityApi.h" +#ifdef HAS_ROCTRACER +#include "RoctracerActivityApi.h" +#endif +#include "ThreadUtil.h" +#include "output_json.h" +#include "output_membuf.h" + +#include "Logger.h" + +using namespace std::chrono; + +namespace KINETO_NAMESPACE { + +constexpr milliseconds kProfilerIntervalMsecs(1000); + +ActivityProfilerController::ActivityProfilerController( + ConfigLoader& configLoader, bool cpuOnly) + : configLoader_(configLoader) { +#ifdef HAS_ROCTRACER + profiler_ = std::make_unique( + RoctracerActivityApi::singleton(), cpuOnly); +#else + profiler_ = std::make_unique( + CuptiActivityApi::singleton(), cpuOnly); +#endif + configLoader_.addHandler(ConfigLoader::ConfigKind::ActivityProfiler, this); +} + +ActivityProfilerController::~ActivityProfilerController() { + configLoader_.removeHandler( + ConfigLoader::ConfigKind::ActivityProfiler, this); + if (profilerThread_) { + // signaling termination of the profiler loop + stopRunloop_ = true; + profilerThread_->join(); + delete profilerThread_; + profilerThread_ = nullptr; + } +} + +static ActivityLoggerFactory initLoggerFactory() { + ActivityLoggerFactory factory; + factory.addProtocol("file", [](const std::string& url) { + return std::unique_ptr(new ChromeTraceLogger(url)); + }); + return factory; +} + +static ActivityLoggerFactory& loggerFactory() { + static ActivityLoggerFactory factory = initLoggerFactory(); + return factory; +} + +void ActivityProfilerController::addLoggerFactory( + const std::string& protocol, ActivityLoggerFactory::FactoryFunc factory) { + loggerFactory().addProtocol(protocol, factory); +} + +static std::unique_ptr makeLogger(const Config& 
config) { + if (config.activitiesLogToMemory()) { + return std::make_unique(config); + } + return loggerFactory().makeLogger(config.activitiesLogUrl()); +} + +bool ActivityProfilerController::canAcceptConfig() { + return !profiler_->isActive(); +} + +void ActivityProfilerController::acceptConfig(const Config& config) { + VLOG(1) << "acceptConfig"; + if (config.activityProfilerEnabled()) { + scheduleTrace(config); + } +} + +void ActivityProfilerController::profilerLoop() { + setThreadName("Kineto Activity Profiler"); + VLOG(0) << "Entering activity profiler loop"; + + auto now = system_clock::now(); + auto next_wakeup_time = now + kProfilerIntervalMsecs; + + while (!stopRunloop_) { + now = system_clock::now(); + + while (now < next_wakeup_time) { + /* sleep override */ + std::this_thread::sleep_for(next_wakeup_time - now); + now = system_clock::now(); + } + + if (!profiler_->isActive()) { + std::lock_guard lock(asyncConfigLock_); + if (asyncRequestConfig_ + && !asyncRequestConfig_->hasProfileStartIteration()) { + // Note on now + kProfilerIntervalMsecs + // Profiler interval does not align perfectly upto startTime - warmup. Waiting until the next tick + // won't allow sufficient time for the profiler to warm up. So check if we are very close to the warmup time and trigger warmup + if (now + kProfilerIntervalMsecs + >= (asyncRequestConfig_->requestTimestamp() - asyncRequestConfig_->activitiesWarmupDuration())) { + LOG(INFO) << "Received on-demand activity trace request by " + << " profile timestamp = " + << asyncRequestConfig_-> + requestTimestamp().time_since_epoch().count(); + activateConfig(now); + } + } + } + + while (next_wakeup_time < now) { + next_wakeup_time += kProfilerIntervalMsecs; + } + + if (profiler_->isActive()) { + next_wakeup_time = profiler_->performRunLoopStep(now, next_wakeup_time); + VLOG(1) << "Profiler loop: " + << duration_cast(system_clock::now() - now).count() + << "ms"; + } + } + + VLOG(0) << "Exited activity profiling loop"; +} + +void ActivityProfilerController::step() { + int64_t currentIter = ++iterationCount_; + VLOG(0) << "Step called , iteration = " << currentIter; + + // optimization to not take the lock unless necessary + if (asyncRequestConfig_ && !profiler_->isActive()) { + std::lock_guard lock(asyncConfigLock_); + auto startIter = asyncRequestConfig_->startIterationIncludingWarmup(); + + if (asyncRequestConfig_->hasProfileStartIteration() + && currentIter >= startIter) { + LOG(INFO) << "Received on-demand activity trace request by profile" + << " start iteration = " + << asyncRequestConfig_->profileStartIteration() + << " current iteration = " << currentIter; + + if (currentIter > startIter) { + // adjust the start iteration if it is in the past + auto newProfileStart = currentIter + + asyncRequestConfig_->activitiesWarmupIterations(); + LOG(INFO) << "Start iteration updated to " << newProfileStart; + asyncRequestConfig_->setProfileStartIteration(newProfileStart); + } + activateConfig(system_clock::now()); + } + } + + if (profiler_->isActive()) { + auto now = system_clock::now(); + auto next_wakeup_time = now + kProfilerIntervalMsecs; + profiler_->performRunLoopStep(now, next_wakeup_time, currentIter); + } +} + +void ActivityProfilerController::activateConfig( + std::chrono::time_point now) { + logger_ = makeLogger(*asyncRequestConfig_); + profiler_->setLogger(logger_.get()); + profiler_->configure(*asyncRequestConfig_, now); + asyncRequestConfig_ = nullptr; +} + +void ActivityProfilerController::scheduleTrace(const Config& config) { + VLOG(1) << 
"scheduleTrace"; + if (profiler_->isActive()) { + LOG(ERROR) << "Ignored request - profiler busy"; + return; + } + int64_t currentIter = iterationCount_; + if (config.hasProfileStartIteration() && currentIter < 0) { + LOG(ERROR) << "Ignored profile iteration count based request as " + << "application is not updating iteration count"; + return; + } + std::lock_guard lock(asyncConfigLock_); + asyncRequestConfig_ = config.clone(); + + auto startIter = asyncRequestConfig_->startIterationIncludingWarmup(); + + if (asyncRequestConfig_->hasProfileStartIteration() + && (currentIter > startIter) + && asyncRequestConfig_->profileStartIterationRoundUp() > 0) { + auto newProfileStart + = currentIter + asyncRequestConfig_->activitiesWarmupIterations(); + // round up to nearest multiple + auto divisor = asyncRequestConfig_->profileStartIterationRoundUp(); + auto rem = newProfileStart % divisor; + newProfileStart += ((rem == 0) ? 0 : divisor - rem); + LOG(INFO) << "Rounding up profiler start iteration to : " << newProfileStart; + asyncRequestConfig_->setProfileStartIteration(newProfileStart); + } + + // start a profilerLoop() thread to handle request + if (!profilerThread_) { + profilerThread_ = + new std::thread(&ActivityProfilerController::profilerLoop, this); + } +} + +void ActivityProfilerController::prepareTrace(const Config& config) { + // Requests from ActivityProfilerApi have higher priority than + // requests from other sources (signal, daemon). + // Cancel any ongoing request and refuse new ones. + auto now = system_clock::now(); + if (profiler_->isActive()) { + LOG(WARNING) << "Cancelling current trace request in order to start " + << "higher priority synchronous request"; + if (libkineto::api().client()) { + libkineto::api().client()->stop(); + } + profiler_->stopTrace(now); + profiler_->reset(); + } + + profiler_->configure(config, now); +} + +std::unique_ptr ActivityProfilerController::stopTrace() { + profiler_->stopTrace(std::chrono::system_clock::now()); + auto logger = std::make_unique(profiler_->config()); + profiler_->processTrace(*logger); + profiler_->reset(); + return std::make_unique(std::move(logger), loggerFactory()); +} + +void ActivityProfilerController::addMetadata( + const std::string& key, const std::string& value) { + profiler_->addMetadata(key, value); +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/ActivityProfilerController.h b/tb_plugins/profiling/libkineto/src/ActivityProfilerController.h new file mode 100644 index 0000000000000000000000000000000000000000..415f107cbed6aab4777c65e9e51d65686002e762 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/ActivityProfilerController.h @@ -0,0 +1,84 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ActivityLoggerFactory.h" +#include "CuptiActivityProfiler.h" +#include "ActivityProfilerInterface.h" +#include "ActivityTraceInterface.h" +#include "ConfigLoader.h" +#include "CuptiActivityApi.h" + +namespace KINETO_NAMESPACE { + +class Config; + +class ActivityProfilerController : public ConfigLoader::ConfigHandler { + public: + explicit ActivityProfilerController(ConfigLoader& configLoader, bool cpuOnly); + ActivityProfilerController(const ActivityProfilerController&) = delete; + ActivityProfilerController& operator=(const ActivityProfilerController&) = + delete; + + ~ActivityProfilerController(); + + static void addLoggerFactory( + const std::string& protocol, + ActivityLoggerFactory::FactoryFunc factory); + + bool canAcceptConfig() override; + void acceptConfig(const Config& config) override; + + void scheduleTrace(const Config& config); + + void prepareTrace(const Config& config); + + void startTrace() { + profiler_->startTrace(std::chrono::system_clock::now()); + } + + void step(); + + std::unique_ptr stopTrace(); + + bool isActive() { + return profiler_->isActive(); + } + + void transferCpuTrace( + std::unique_ptr cpuTrace) { + return profiler_->transferCpuTrace(std::move(cpuTrace)); + } + + void recordThreadInfo() { + profiler_->recordThreadInfo(); + } + + void addChildActivityProfiler( + std::unique_ptr profiler) { + profiler_->addChildActivityProfiler(std::move(profiler)); + } + + void addMetadata(const std::string& key, const std::string& value); + + private: + void profilerLoop(); + void activateConfig(std::chrono::time_point now); + + std::unique_ptr asyncRequestConfig_; + std::mutex asyncConfigLock_; + std::unique_ptr profiler_; + std::unique_ptr logger_; + std::thread* profilerThread_{nullptr}; + std::atomic_bool stopRunloop_{false}; + std::atomic iterationCount_{-1}; + ConfigLoader& configLoader_; +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/ActivityProfilerProxy.cpp b/tb_plugins/profiling/libkineto/src/ActivityProfilerProxy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2d36b7b3abf9c3e0aed838a10e4054a5d292139 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/ActivityProfilerProxy.cpp @@ -0,0 +1,119 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
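Before the proxy implementation below: the synchronous flow that ActivityProfilerController exposes, as a sketch. Illustrative only; it assumes a validated Config and that save() accepts a file:// URL, as shown in ActivityTrace later in this patch.

```cpp
#include "ActivityProfilerController.h"

void runSyncTrace(KINETO_NAMESPACE::ActivityProfilerController& controller,
                  const KINETO_NAMESPACE::Config& config) {
  controller.prepareTrace(config);  // configure; cancels lower-priority work
  controller.startTrace();
  // ... run the workload to be profiled ...
  auto trace = controller.stopTrace();
  trace->save("file:///tmp/sync_trace.json");
}
```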
+ +#include "ActivityProfilerProxy.h" + +#include "ActivityProfilerController.h" +#include "Config.h" +#include "CuptiActivityApi.h" +#include "Logger.h" +#include + +namespace KINETO_NAMESPACE { + +ActivityProfilerProxy::ActivityProfilerProxy( + bool cpuOnly, ConfigLoader& configLoader) + : cpuOnly_(cpuOnly), configLoader_(configLoader) { +} + +ActivityProfilerProxy::~ActivityProfilerProxy() { + delete controller_; +}; + +void ActivityProfilerProxy::init() { + if (!controller_) { + controller_ = new ActivityProfilerController(configLoader_, cpuOnly_); + } +} + +void ActivityProfilerProxy::scheduleTrace(const std::string& configStr) { + Config config; + config.parse(configStr); + controller_->scheduleTrace(config); +} + +void ActivityProfilerProxy::scheduleTrace(const Config& config) { + controller_->scheduleTrace(config); +} + +void ActivityProfilerProxy::prepareTrace( + const std::set& activityTypes, + const std::string& configStr) { + Config config; + bool validate_required = true; + + // allow user provided config to override default options + if (!configStr.empty()) { + if (!config.parse(configStr)) { + LOG(WARNING) << "Failed to parse config : " << configStr; + } + // parse also runs validate + validate_required = false; + } + + config.setClientDefaults(); + config.setSelectedActivityTypes(activityTypes); + + if (validate_required) { + config.validate(std::chrono::system_clock::now()); + } + + controller_->prepareTrace(config); +} + +void ActivityProfilerProxy::startTrace() { + controller_->startTrace(); +} + +std::unique_ptr +ActivityProfilerProxy::stopTrace() { + return controller_->stopTrace(); +} + +void ActivityProfilerProxy::step() { + controller_->step(); +} + +bool ActivityProfilerProxy::isActive() { + return controller_->isActive(); +} + +void ActivityProfilerProxy::pushCorrelationId(uint64_t id) { + CuptiActivityApi::pushCorrelationID(id, + CuptiActivityApi::CorrelationFlowType::Default); +} + +void ActivityProfilerProxy::popCorrelationId() { + CuptiActivityApi::popCorrelationID( + CuptiActivityApi::CorrelationFlowType::Default); +} + +void ActivityProfilerProxy::pushUserCorrelationId(uint64_t id) { + CuptiActivityApi::pushCorrelationID(id, + CuptiActivityApi::CorrelationFlowType::User); +} + +void ActivityProfilerProxy::popUserCorrelationId() { + CuptiActivityApi::popCorrelationID( + CuptiActivityApi::CorrelationFlowType::User); +} + +void ActivityProfilerProxy::transferCpuTrace( + std::unique_ptr traceBuffer) { + controller_->transferCpuTrace(std::move(traceBuffer)); +} + +void ActivityProfilerProxy::addMetadata( + const std::string& key, const std::string& value) { + controller_->addMetadata(key, value); +} + +void ActivityProfilerProxy::recordThreadInfo() { + controller_->recordThreadInfo(); +} + +void ActivityProfilerProxy::addChildActivityProfiler( + std::unique_ptr profiler) { + controller_->addChildActivityProfiler(std::move(profiler)); +} + +} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/src/ActivityProfilerProxy.h b/tb_plugins/profiling/libkineto/src/ActivityProfilerProxy.h new file mode 100644 index 0000000000000000000000000000000000000000..b5cf84b2f1ddb005060fea0927c99fc63d144d99 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/ActivityProfilerProxy.h @@ -0,0 +1,73 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+ +#pragma once + +#include "ActivityProfilerInterface.h" + +#include +#include +#include + +#include "ActivityType.h" +#include "ITraceActivity.h" + +namespace libkineto { + // previous declaration is struct so this one must be too. + struct CpuTraceBuffer; +} + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +class ActivityProfilerController; +class Config; +class ConfigLoader; + +class ActivityProfilerProxy : public ActivityProfilerInterface { + + public: + ActivityProfilerProxy(bool cpuOnly, ConfigLoader& configLoader); + ~ActivityProfilerProxy() override; + + void init() override; + bool isInitialized() override { + return controller_ != nullptr; + } + + bool isActive() override; + + void recordThreadInfo() override; + + void scheduleTrace(const std::string& configStr) override; + void scheduleTrace(const Config& config); + + void prepareTrace( + const std::set& activityTypes, + const std::string& configStr = "") override; + + void startTrace() override; + void step() override; + std::unique_ptr stopTrace() override; + + void pushCorrelationId(uint64_t id) override; + void popCorrelationId() override; + + void pushUserCorrelationId(uint64_t id) override; + void popUserCorrelationId() override; + + void transferCpuTrace( + std::unique_ptr traceBuffer) override; + + void addMetadata(const std::string& key, const std::string& value) override; + + virtual void addChildActivityProfiler( + std::unique_ptr profiler) override; + + private: + bool cpuOnly_{true}; + ConfigLoader& configLoader_; + ActivityProfilerController* controller_{nullptr}; +}; + +} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/src/ActivityTrace.h b/tb_plugins/profiling/libkineto/src/ActivityTrace.h new file mode 100644 index 0000000000000000000000000000000000000000..0be76af08e47c16ebee2ac1d1ad01c4425ff17a5 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/ActivityTrace.h @@ -0,0 +1,45 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include +#include + +#include "ActivityLoggerFactory.h" +#include "ActivityTraceInterface.h" +#include "output_json.h" +#include "output_membuf.h" + +namespace libkineto { + +class ActivityTrace : public ActivityTraceInterface { + public: + ActivityTrace( + std::unique_ptr tmpLogger, + const ActivityLoggerFactory& factory) + : memLogger_(std::move(tmpLogger)), + loggerFactory_(factory) { + } + + const std::vector* activities() override { + return memLogger_->traceActivities(); + }; + + void save(const std::string& url) override { + std::string prefix; + // if no protocol is specified, default to file + if (url.find("://") == url.npos) { + prefix = "file://"; + } + memLogger_->log(*loggerFactory_.makeLogger(prefix + url)); + }; + + private: + // Activities are logged into a buffer + std::unique_ptr memLogger_; + + // Alternative logger used by save() if protocol prefix is specified + const ActivityLoggerFactory& loggerFactory_; +}; + +} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/src/ActivityType.cpp b/tb_plugins/profiling/libkineto/src/ActivityType.cpp new file mode 100644 index 0000000000000000000000000000000000000000..18856b72370abdb6d9cf4309b32be4cae10805de --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/ActivityType.cpp @@ -0,0 +1,58 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
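ActivityTrace::save() above defaults a bare path to the file protocol before dispatching through ActivityLoggerFactory. A sketch of the two equivalent spellings (illustrative only):

```cpp
#include "ActivityTrace.h"

void saveBothWays(libkineto::ActivityTrace& trace) {
  trace.save("/tmp/trace.json");         // becomes file:///tmp/trace.json
  trace.save("file:///tmp/trace.json");  // explicit protocol, same logger
}
```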
+ +#include "ActivityType.h" + +#include + +namespace libkineto { + +struct ActivityTypeName { + const char* name; + ActivityType type; +}; + +static constexpr std::array map{{ + {"cpu_op", ActivityType::CPU_OP}, + {"user_annotation", ActivityType::USER_ANNOTATION}, + {"gpu_user_Annotation", ActivityType::GPU_USER_ANNOTATION}, + {"gpu_memcpy", ActivityType::GPU_MEMCPY}, + {"gpu_memset", ActivityType::GPU_MEMSET}, + {"kernel", ActivityType::CONCURRENT_KERNEL}, + {"external_correlation", ActivityType::EXTERNAL_CORRELATION}, + {"cuda_runtime", ActivityType::CUDA_RUNTIME}, + {"cuda_profiler_range", ActivityType::CUDA_PROFILER_RANGE}, + {"glow_runtime", ActivityType::GLOW_RUNTIME}, + {"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT}, + {"python_function", ActivityType::PYTHON_FUNCTION}, + {"overhead", ActivityType::OVERHEAD}, + {"ENUM_COUNT", ActivityType::ENUM_COUNT} +}}; + +static constexpr bool matchingOrder(int idx = 0) { + return map[idx].type == ActivityType::ENUM_COUNT || + ((idx == (int) map[idx].type) && matchingOrder(idx + 1)); +} +static_assert(matchingOrder(), "ActivityTypeName map is out of order"); + +const char* toString(ActivityType t) { + return map[(int)t].name; +} + +ActivityType toActivityType(const std::string& str) { + for (int i = 0; i < activityTypeCount; i++) { + if (str == map[i].name) { + return map[i].type; + } + } + throw std::invalid_argument(fmt::format("Invalid activity type: {}", str)); +} + +const std::array activityTypes() { + std::array res; + for (int i = 0; i < activityTypeCount; i++) { + res[i] = map[i].type; + } + return res; +} + +} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/src/Config.cpp b/tb_plugins/profiling/libkineto/src/Config.cpp new file mode 100644 index 0000000000000000000000000000000000000000..95538840f378e83b2b44161823042c620b34fe93 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/Config.cpp @@ -0,0 +1,473 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+ +#include "Config.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Logger.h" +#include "ThreadUtil.h" + +using namespace std::chrono; + +using std::string; +using std::vector; + +namespace KINETO_NAMESPACE { + +constexpr milliseconds kDefaultSamplePeriodMsecs(1000); +constexpr milliseconds kDefaultMultiplexPeriodMsecs(1000); +constexpr milliseconds kDefaultActivitiesProfileDurationMSecs(500); +constexpr int kDefaultActivitiesMaxGpuBufferSize(128 * 1024 * 1024); +constexpr seconds kDefaultActivitiesWarmupDurationSecs(5); +constexpr seconds kDefaultBufferUntilWarmup(10); +constexpr seconds kDefaultReportPeriodSecs(1); +constexpr int kDefaultSamplesPerReport(1); +constexpr int kDefaultMaxEventProfilersPerGpu(1); +constexpr int kDefaultEventProfilerHearbeatMonitorPeriod(0); +constexpr seconds kMaxRequestAge(10); + +// Event Profiler +constexpr char kEventsKey[] = "EVENTS"; +constexpr char kMetricsKey[] = "METRICS"; +constexpr char kSamplePeriodKey[] = "SAMPLE_PERIOD_MSECS"; +constexpr char kMultiplexPeriodKey[] = "MULTIPLEX_PERIOD_MSECS"; +constexpr char kReportPeriodKey[] = "REPORT_PERIOD_SECS"; +constexpr char kSamplesPerReportKey[] = "SAMPLES_PER_REPORT"; +constexpr char kEventsLogFileKey[] = "EVENTS_LOG_FILE"; +constexpr char kEventsEnabledDevicesKey[] = "EVENTS_ENABLED_DEVICES"; +constexpr char kOnDemandDurationKey[] = "EVENTS_DURATION_SECS"; +constexpr char kMaxEventProfilersPerGpuKey[] = "MAX_EVENT_PROFILERS_PER_GPU"; +constexpr char kHeartbeatMonitorPeriodKey[] = + "EVENTS_HEARTBEAT_MONITOR_PERIOD_SECS"; + +// Activity Profiler +constexpr char kActivitiesEnabledKey[] = "ACTIVITIES_ENABLED"; +constexpr char kActivityTypesKey[] = "ACTIVITY_TYPES"; +constexpr char kActivitiesLogFileKey[] = "ACTIVITIES_LOG_FILE"; +constexpr char kActivitiesDurationKey[] = "ACTIVITIES_DURATION_SECS"; +constexpr char kActivitiesDurationMsecsKey[] = "ACTIVITIES_DURATION_MSECS"; +constexpr char kActivitiesWarmupDurationSecsKey[] = "ACTIVITIES_WARMUP_PERIOD_SECS"; +constexpr char kActivitiesMaxGpuBufferSizeKey[] = + "ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB"; + +// Client Interface +constexpr char kClientInterfaceEnableOpInputsCollection[] = "CLIENT_INTERFACE_ENABLE_OP_INPUTS_COLLECTION"; + +constexpr char kActivitiesWarmupIterationsKey[] = "ACTIVITIES_WARMUP_ITERATIONS"; +constexpr char kActivitiesIterationsKey[] = "ACTIVITIES_ITERATIONS"; +// Common + +// Client-side timestamp used for synchronized start across hosts for +// distributed workloads. +// Specified in milliseconds Unix time (milliseconds since epoch). +// To use, compute a future timestamp as follows: +// * C++: + duration_cast( +// system_clock::now().time_since_epoch()).count() +// * Python: + int(time.time() * 1000) +// * Bash: $(( + $(date +%s%3N))) +// If used for a tracing request, timestamp must be far enough in the future +// to accommodate ACTIVITIES_WARMUP_PERIOD_SECS as well as any delays in +// propagating the request to the profiler. +// If the request can not be honored, it is up to the profilers to report +// an error somehow - no checks are done at config parse time. +// Note PROFILE_START_ITERATION has higher precedence +constexpr char kProfileStartTimeKey[] = "PROFILE_START_TIME"; +// DEPRECATED - USE PROFILE_START_TIME instead +constexpr char kRequestTimestampKey[] = "REQUEST_TIMESTAMP"; + +// Alternatively if the application supports reporting iterations +// start the profile at specific iteration. 
If the iteration count +// is >= this value the profile is started immediately. +// A value >= 0 is valid for this config option to take effect. +// Note PROFILE_START_ITERATION will take precedence over PROFILE_START_TIME. +constexpr char kProfileStartIterationKey[] = "PROFILE_START_ITERATION"; + +// Users can also start the profile on an integer multiple of the config +// value PROFILE_START_ITERATION_ROUNDUP. This knob behaves similar to +// PROFILE_START_ITERATION but instead of saying : "start collection trace on +// iteration 500", one can configure it to "start collecting trace on the next +// 100th iteration". +// +// For example, +// PROFILE_START_ITERATION_ROUNDUP = 1000, and the current iteration is 2010 +// The profile will then be collected on the next multiple of 1000 ie. 3000 +// Note PROFILE_START_ITERATION_ROUNDUP will also take precedence over +// PROFILE_START_TIME. +constexpr char kProfileStartIterationRoundUpKey[] + = "PROFILE_START_ITERATION_ROUNDUP"; + +// Enable on-demand trigger via kill -USR2 +// When triggered in this way, /tmp/libkineto.conf will be used as config. +constexpr char kEnableSigUsr2Key[] = "ENABLE_SIGUSR2"; + +// Enable communication through IPC Fabric +// and disable thrift communication with dynolog daemon +constexpr char kEnableIpcFabricKey[] = "ENABLE_IPC_FABRIC"; + +// Verbose log level +// The actual glog is not used and --v and --vmodule has no effect. +// Instead set the verbose level and modules in the config file. +constexpr char kLogVerboseLevelKey[] = "VERBOSE_LOG_LEVEL"; +// By default, all modules will log verbose messages >= verboseLogLevel. +// But to reduce noise we can specify one or more modules of interest. +// A module is a C/C++ object file (source file name), +// Example argument: ActivityProfiler.cpp,output_json.cpp +constexpr char kLogVerboseModulesKey[] = "VERBOSE_LOG_MODULES"; + +// Max devices supported on any system +constexpr uint8_t kMaxDevices = 8; + +namespace { + +struct FactoryMap { + + void addFactory( + std::string name, + std::function factory) { + std::lock_guard lock(lock_); + factories_[name] = factory; + } + + void addFeatureConfigs(Config& cfg) { + std::lock_guard lock(lock_); + for (const auto& p : factories_) { + cfg.addFeature(p.first, p.second(cfg)); + } + } + +// Config factories are shared between objects and since +// config objects can be created by multiple threads, we need a lock. + std::mutex lock_; + std::map> factories_; +}; + +std::shared_ptr configFactories() { + // Ensure this is safe to call during shutdown, even as static + // destructors are invoked. Once factories destructor has been + // invoked, weak_ptr.lock() will return nullptr. + // But calls before that point will have a valid shared_ptr, + // delaying destruction of the underlying FactoryMap. 
+ static auto factories = std::make_shared(); + static std::weak_ptr weak_ptr = factories; + return weak_ptr.lock(); +} + +} // namespace + +void Config::addConfigFactory( + std::string name, + std::function factory) { + auto factories = configFactories(); + if (factories) { + factories->addFactory(name, factory); + } +} + +static string defaultTraceFileName() { + return fmt::format("/tmp/libkineto_activities_{}.json", processId()); +} + +Config::Config() + : verboseLogLevel_(-1), + samplePeriod_(kDefaultSamplePeriodMsecs), + reportPeriod_(duration_cast(kDefaultReportPeriodSecs)), + samplesPerReport_(kDefaultSamplesPerReport), + eventProfilerOnDemandDuration_(seconds(0)), + eventProfilerMaxInstancesPerGpu_(kDefaultMaxEventProfilersPerGpu), + eventProfilerHeartbeatMonitorPeriod_( + kDefaultEventProfilerHearbeatMonitorPeriod), + multiplexPeriod_(kDefaultMultiplexPeriodMsecs), + activityProfilerEnabled_(true), + activitiesLogFile_(defaultTraceFileName()), + activitiesLogUrl_(fmt::format("file://{}", activitiesLogFile_)), + activitiesMaxGpuBufferSize_(kDefaultActivitiesMaxGpuBufferSize), + activitiesWarmupDuration_(kDefaultActivitiesWarmupDurationSecs), + activitiesWarmupIterations_(0), + activitiesDuration_(kDefaultActivitiesProfileDurationMSecs), + activitiesRunIterations_(0), + activitiesOnDemandTimestamp_(milliseconds(0)), + profileStartTime_(milliseconds(0)), + profileStartIteration_(-1), + profileStartIterationRoundUp_(-1), + requestTimestamp_(milliseconds(0)), + enableSigUsr2_(false), + enableIpcFabric_(false) { + auto factories = configFactories(); + if (factories) { + factories->addFeatureConfigs(*this); + } +} + +uint8_t Config::createDeviceMask(const string& val) { + uint8_t res = 0; + for (const auto& d : splitAndTrim(val, ',')) { + res |= 1 << toIntRange(d, 0, kMaxDevices - 1); + } + return res; +} + +const seconds Config::maxRequestAge() const { + return kMaxRequestAge; +} + +static std::string getTimeStr(time_point t) { + std::time_t t_c = system_clock::to_time_t(t); + return fmt::format("{:%H:%M:%S}", fmt::localtime(t_c)); +} + +static time_point handleRequestTimestamp(int64_t ms) { + auto t = time_point(milliseconds(ms)); + auto now = system_clock::now(); + if (t > now) { + throw std::invalid_argument(fmt::format( + "Invalid {}: {} - time is in future", + kRequestTimestampKey, + getTimeStr(t))); + } else if ((now - t) > kMaxRequestAge) { + throw std::invalid_argument(fmt::format( + "Invalid {}: {} - time is more than {}s in the past", + kRequestTimestampKey, + getTimeStr(t), + kMaxRequestAge.count())); + } + return t; +} + +void Config::setActivityTypes( + const std::vector& selected_activities) { + selectedActivityTypes_.clear(); + if (selected_activities.size() > 0) { + for (const auto& activity : selected_activities) { + if (activity == "") { + continue; + } + selectedActivityTypes_.insert(toActivityType(activity)); + } + } +} + +bool Config::handleOption(const std::string& name, std::string& val) { + // Event Profiler + if (!name.compare(kEventsKey)) { + vector event_names = splitAndTrim(val, ','); + eventNames_.insert(event_names.begin(), event_names.end()); + } else if (!name.compare(kMetricsKey)) { + vector metric_names = splitAndTrim(val, ','); + metricNames_.insert(metric_names.begin(), metric_names.end()); + } else if (!name.compare(kSamplePeriodKey)) { + samplePeriod_ = milliseconds(toInt32(val)); + } else if (!name.compare(kMultiplexPeriodKey)) { + multiplexPeriod_ = milliseconds(toInt32(val)); + } else if (!name.compare(kReportPeriodKey)) { + 
setReportPeriod(seconds(toInt32(val))); + } else if (!name.compare(kSamplesPerReportKey)) { + samplesPerReport_ = toInt32(val); + } else if (!name.compare(kEventsLogFileKey)) { + eventLogFile_ = val; + } else if (!name.compare(kEventsEnabledDevicesKey)) { + eventProfilerDeviceMask_ = createDeviceMask(val); + } else if (!name.compare(kOnDemandDurationKey)) { + eventProfilerOnDemandDuration_ = seconds(toInt32(val)); + eventProfilerOnDemandTimestamp_ = timestamp(); + } else if (!name.compare(kMaxEventProfilersPerGpuKey)) { + eventProfilerMaxInstancesPerGpu_ = toInt32(val); + } else if (!name.compare(kHeartbeatMonitorPeriodKey)) { + eventProfilerHeartbeatMonitorPeriod_ = seconds(toInt32(val)); + } + + // Activity Profiler + else if (!name.compare(kActivitiesDurationKey)) { + activitiesDuration_ = + duration_cast(seconds(toInt32(val))); + activitiesOnDemandTimestamp_ = timestamp(); + } else if (!name.compare(kActivityTypesKey)) { + vector activity_types = splitAndTrim(toLower(val), ','); + setActivityTypes(activity_types); + } else if (!name.compare(kActivitiesDurationMsecsKey)) { + activitiesDuration_ = milliseconds(toInt32(val)); + activitiesOnDemandTimestamp_ = timestamp(); + } else if (!name.compare(kActivitiesIterationsKey)) { + activitiesRunIterations_ = toInt32(val); + activitiesOnDemandTimestamp_ = timestamp(); + } else if (!name.compare(kLogVerboseLevelKey)) { + verboseLogLevel_ = toInt32(val); + } else if (!name.compare(kLogVerboseModulesKey)) { + verboseLogModules_ = splitAndTrim(val, ','); + } else if (!name.compare(kActivitiesEnabledKey)) { + activityProfilerEnabled_ = toBool(val); + } else if (!name.compare(kActivitiesLogFileKey)) { + activitiesLogFile_ = val; + activitiesLogUrl_ = fmt::format("file://{}", val); + activitiesOnDemandTimestamp_ = timestamp(); + } else if (!name.compare(kActivitiesMaxGpuBufferSizeKey)) { + activitiesMaxGpuBufferSize_ = toInt32(val) * 1024 * 1024; + } else if (!name.compare(kActivitiesWarmupDurationSecsKey)) { + activitiesWarmupDuration_ = seconds(toInt32(val)); + } else if (!name.compare(kActivitiesWarmupIterationsKey)) { + activitiesWarmupIterations_ = toInt32(val); + } + + // Client Interface + else if (!name.compare(kClientInterfaceEnableOpInputsCollection)) { + enableOpInputsCollection_ = toBool(val); + } + + // Common + else if (!name.compare(kRequestTimestampKey)) { + VLOG(0) << kRequestTimestampKey + << " has been deprecated - please use " + << kProfileStartTimeKey; + requestTimestamp_ = handleRequestTimestamp(toInt64(val)); + } else if (!name.compare(kProfileStartTimeKey)) { + profileStartTime_ = + time_point(milliseconds(toInt64(val))); + } else if (!name.compare(kProfileStartIterationKey)) { + profileStartIteration_ = toInt32(val); + } else if (!name.compare(kProfileStartIterationRoundUpKey)) { + profileStartIterationRoundUp_ = toInt32(val); + } else if (!name.compare(kEnableSigUsr2Key)) { + enableSigUsr2_ = toBool(val); + } else if (!name.compare(kEnableIpcFabricKey)) { + enableIpcFabric_ = toBool(val); + } else { + return false; + } + return true; +} + +std::chrono::milliseconds Config::activitiesDurationDefault() const { + return kDefaultActivitiesProfileDurationMSecs; +}; + +void Config::updateActivityProfilerRequestReceivedTime() { + activitiesOnDemandTimestamp_ = system_clock::now(); +} + +void Config::setClientDefaults() { + AbstractConfig::setClientDefaults(); + activitiesLogToMemory_ = true; +} + +void Config::validate( + const time_point& fallbackProfileStartTime) { + if (samplePeriod_.count() == 0) { + LOG(WARNING) << "Sample 
period must be greater than 0, setting to 1ms"; + samplePeriod_ = milliseconds(1); + } + + if (multiplexPeriod_ < samplePeriod_) { + LOG(WARNING) << "Multiplex period can not be smaller " + << "than sample period"; + LOG(WARNING) << "Setting multiplex period to " << samplePeriod_.count() + << "ms"; + multiplexPeriod_ = samplePeriod_; + } + + if ((multiplexPeriod_ % samplePeriod_).count() != 0) { + LOG(WARNING) << "Multiplex period must be a " + << "multiple of sample period"; + multiplexPeriod_ = alignUp(multiplexPeriod_, samplePeriod_); + LOG(WARNING) << "Setting multiplex period to " << multiplexPeriod_.count() + << "ms"; + } + + if ((reportPeriod_ % multiplexPeriod_).count() != 0 || + reportPeriod_.count() == 0) { + LOG(WARNING) << "Report period must be a " + << "multiple of multiplex period"; + reportPeriod_ = alignUp(reportPeriod_, multiplexPeriod_); + LOG(WARNING) << "Setting report period to " << reportPeriod_.count() + << "ms"; + } + + if (samplesPerReport_ < 1) { + LOG(WARNING) << "Samples per report must be in the range " + << "[1, report period / sample period]"; + LOG(WARNING) << "Setting samples per report to 1"; + samplesPerReport_ = 1; + } + + int max_samples_per_report = reportPeriod_ / samplePeriod_; + if (samplesPerReport_ > max_samples_per_report) { + LOG(WARNING) << "Samples per report must be in the range " + << "[1, report period / sample period] ([1, " + << reportPeriod_.count() << "ms / " << samplePeriod_.count() + << "ms = " << max_samples_per_report << "])"; + LOG(WARNING) << "Setting samples per report to " << max_samples_per_report; + samplesPerReport_ = max_samples_per_report; + } + + if (!hasProfileStartTime()) { + VLOG(0) + << "No explicit timestamp has been set. " + << "Defaulting it to now + activitiesWarmupDuration with buffer."; + profileStartTime_ = fallbackProfileStartTime + + activitiesWarmupDuration() + kDefaultBufferUntilWarmup; + } + + if (profileStartIterationRoundUp_ == 0) { + // setting to 0 will mess up modulo arithmetic, set it to -1 so it has no effect + LOG(WARNING) << "Profiler start iteration round up should be >= 1."; + profileStartIterationRoundUp_ = -1; + } + + if (profileStartIterationRoundUp_ > 0 && !hasProfileStartIteration()) { + VLOG(0) << "Setting profiler start iteration to 0 so this config is " + << "triggered via iteration count."; + profileStartIteration_ = 0; + } + + if (selectedActivityTypes_.size() == 0) { + selectDefaultActivityTypes(); + } +} + +void Config::setReportPeriod(milliseconds msecs) { + reportPeriod_ = msecs; +} + +void Config::printActivityProfilerConfig(std::ostream& s) const { + s << "Log file: " << activitiesLogFile() << std::endl; + if (hasProfileStartIteration()) { + s << "Trace start Iteration: " << profileStartIteration() << std::endl; + s << "Trace warmup Iterations: " << activitiesWarmupIterations() << std::endl; + s << "Trace profile Iterations: " << activitiesRunIterations() << std::endl; + if (profileStartIterationRoundUp() > 0) { + s << "Trace start iteration roundup : " << profileStartIterationRoundUp() + << std::endl; + } + } else if (hasProfileStartTime()) { + std::time_t t_c = system_clock::to_time_t(requestTimestamp()); + LOG(INFO) << "Trace start time: " + << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(t_c)); + s << "Trace duration: " << activitiesDuration().count() << "ms" + << std::endl; + s << "Warmup duration: " << activitiesWarmupDuration().count() << "s" + << std::endl; + } + + s << "Max GPU buffer size: " << activitiesMaxGpuBufferSize() / 1024 / 1024 + << "MB" << std::endl; + + 
std::vector activities; + for (const auto& activity : selectedActivityTypes_) { + activities.push_back(toString(activity)); + } + s << "Enabled activities: " + << fmt::format("{}", fmt::join(activities, ",")) << std::endl; + + AbstractConfig::printActivityProfilerConfig(s); +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/ConfigLoader.cpp b/tb_plugins/profiling/libkineto/src/ConfigLoader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4080b678d371e98757897d4d7726c159887377e1 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/ConfigLoader.cpp @@ -0,0 +1,300 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "ConfigLoader.h" + +#ifdef __linux__ +#include +#endif + +#include +#include +#include +#include +#include + +#include "DaemonConfigLoader.h" + +#include "Logger.h" + +using namespace std::chrono; +using std::string; + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +constexpr char kConfigFileEnvVar[] = "KINETO_CONFIG"; +#ifdef __linux__ +constexpr char kConfigFile[] = "/etc/libkineto.conf"; +constexpr char kOnDemandConfigFile[] = "/tmp/libkineto.conf"; +#else +constexpr char kConfigFile[] = "libkineto.conf"; +constexpr char kOnDemandConfigFile[] = "libkineto.conf"; +#endif + +constexpr std::chrono::seconds kConfigUpdateIntervalSecs(300); +constexpr std::chrono::seconds kOnDemandConfigUpdateIntervalSecs(5); + +#ifdef __linux__ +static struct sigaction originalUsr2Handler = {}; +#endif + +// Use SIGUSR2 to initiate profiling. +// Look for an on-demand config file. +// If none is found, default to base config. +// Try to not affect existing handlers +static bool hasOriginalSignalHandler() { +#ifdef __linux__ + return originalUsr2Handler.sa_handler != nullptr || + originalUsr2Handler.sa_sigaction != nullptr; +#else + return false; +#endif +} + +static void handle_signal(int signal) { +#ifdef __linux__ + if (signal == SIGUSR2) { + ConfigLoader::instance().handleOnDemandSignal(); + if (hasOriginalSignalHandler()) { + // Invoke original handler and reinstate ours + struct sigaction act; + sigaction(SIGUSR2, &originalUsr2Handler, &act); + raise(SIGUSR2); + sigaction(SIGUSR2, &act, &originalUsr2Handler); + } + } +#endif +} + +static void setupSignalHandler(bool enableSigUsr2) { +#ifdef __linux__ + if (enableSigUsr2) { + struct sigaction act = {}; + act.sa_handler = &handle_signal; + act.sa_flags = SA_NODEFER; + if (sigaction(SIGUSR2, &act, &originalUsr2Handler) < 0) { + PLOG(ERROR) << "Failed to register SIGUSR2 handler"; + } + if (originalUsr2Handler.sa_handler == &handle_signal) { + originalUsr2Handler = {}; + } + } else if (hasOriginalSignalHandler()) { + sigaction(SIGUSR2, &originalUsr2Handler, nullptr); + originalUsr2Handler = {}; + } +#endif +} + +// return an empty string if reading gets any errors. Otherwise a config string. +static std::string readConfigFromConfigFile(const char* filename) { + // Read whole file into a string. 
+ std::ifstream file(filename); + std::string conf; + try { + conf.assign( + std::istreambuf_iterator(file), std::istreambuf_iterator()); + } catch (std::exception& e) { + VLOG(0) << "Error reading " << filename << ": " + << e.what(); + conf = ""; + } + return conf; +} + +static std::function()>& +daemonConfigLoaderFactory() { + static std::function()> factory = nullptr; + return factory; +} + +void ConfigLoader::setDaemonConfigLoaderFactory( + std::function()> factory) { + daemonConfigLoaderFactory() = factory; +} + +ConfigLoader& ConfigLoader::instance() { + static ConfigLoader config_loader; + return config_loader; +} + +// return an empty string if polling gets any errors. Otherwise a config string. +std::string ConfigLoader::readOnDemandConfigFromDaemon( + time_point now) { + if (!daemonConfigLoader_) { + return ""; + } + bool events = canHandlerAcceptConfig(ConfigKind::EventProfiler); + bool activities = canHandlerAcceptConfig(ConfigKind::ActivityProfiler); + return daemonConfigLoader_->readOnDemandConfig(events, activities); +} + +int ConfigLoader::contextCountForGpu(uint32_t device) { + if (!daemonConfigLoader_) { + // FIXME: Throw error? + return 0; + } + return daemonConfigLoader_->gpuContextCount(device); +} + +ConfigLoader::ConfigLoader() + : configUpdateIntervalSecs_(kConfigUpdateIntervalSecs), + onDemandConfigUpdateIntervalSecs_(kOnDemandConfigUpdateIntervalSecs), + stopFlag_(false), + onDemandSignal_(false) { +} + +void ConfigLoader::startThread() { + if (!updateThread_) { + // Create default base config here - at this point static initializers + // of extensions should have run and registered all config feature factories + std::lock_guard lock(configLock_); + if (!config_) { + config_ = std::make_unique(); + } + updateThread_ = + std::make_unique(&ConfigLoader::updateConfigThread, this); + } +} + +ConfigLoader::~ConfigLoader() { + if (updateThread_) { + stopFlag_ = true; + { + std::lock_guard lock(updateThreadMutex_); + updateThreadCondVar_.notify_one(); + } + updateThread_->join(); + } +#if !USE_GOOGLE_LOG + Logger::clearLoggerObservers(); +#endif // !USE_GOOGLE_LOG +} + +void ConfigLoader::handleOnDemandSignal() { + onDemandSignal_ = true; + { + std::lock_guard lock(updateThreadMutex_); + updateThreadCondVar_.notify_one(); + } +} + +const char* ConfigLoader::configFileName() { + if (!configFileName_) { + configFileName_ = getenv(kConfigFileEnvVar); + if (configFileName_ == nullptr) { + configFileName_ = kConfigFile; + } + } + return configFileName_; +} + +DaemonConfigLoader* ConfigLoader::daemonConfigLoader() { + if (!daemonConfigLoader_ && daemonConfigLoaderFactory()) { + daemonConfigLoader_ = daemonConfigLoaderFactory()(); + daemonConfigLoader_->setCommunicationFabric(config_->ipcFabricEnabled()); + } + return daemonConfigLoader_.get(); +} + +void ConfigLoader::updateBaseConfig() { + // First try reading local config file + // If that fails, read from daemon + // TODO: Invert these once daemon path fully rolled out + std::string config_str = readConfigFromConfigFile(configFileName()); + if (config_str.empty() && daemonConfigLoader()) { + // If local config file was not successfully loaded (e.g. 
not found) + // then try the daemon + config_str = daemonConfigLoader()->readBaseConfig(); + } + if (config_str != config_->source()) { + std::lock_guard lock(configLock_); + config_ = std::make_unique(); + config_->parse(config_str); + if (daemonConfigLoader()) { + daemonConfigLoader()->setCommunicationFabric(config_->ipcFabricEnabled()); + } + setupSignalHandler(config_->sigUsr2Enabled()); + SET_LOG_VERBOSITY_LEVEL( + config_->verboseLogLevel(), + config_->verboseLogModules()); + VLOG(0) << "Detected base config change"; + } +} + +void ConfigLoader::configureFromSignal( + time_point now, + Config& config) { + LOG(INFO) << "Received on-demand profiling signal, " + << "reading config from " << kOnDemandConfigFile; + // Reset start time to 0 in order to compute new default start time + const std::string config_str = "PROFILE_START_TIME=0\n" + + readConfigFromConfigFile(kOnDemandConfigFile); + config.parse(config_str); + config.setSignalDefaults(); + notifyHandlers(config); +} + +void ConfigLoader::configureFromDaemon( + time_point now, + Config& config) { + const std::string config_str = readOnDemandConfigFromDaemon(now); + if (config_str.empty()) { + return; + } + + LOG(INFO) << "Received config from dyno:\n" << config_str; + config.parse(config_str); + notifyHandlers(config); +} + +void ConfigLoader::updateConfigThread() { + auto now = system_clock::now(); + auto next_config_load_time = now; + auto next_on_demand_load_time = now + onDemandConfigUpdateIntervalSecs_; + seconds interval = configUpdateIntervalSecs_; + if (interval > onDemandConfigUpdateIntervalSecs_) { + interval = onDemandConfigUpdateIntervalSecs_; + } + auto onDemandConfig = std::make_unique(); + + // This can potentially sleep for long periods of time, so allow + // the desctructor to wake it to avoid a 5-minute long destruct period. + for (;;) { + { + std::unique_lock lock(updateThreadMutex_); + updateThreadCondVar_.wait_for(lock, interval); + } + if (stopFlag_) { + break; + } + now = system_clock::now(); + if (now > next_config_load_time) { + updateBaseConfig(); + next_config_load_time = now + configUpdateIntervalSecs_; + } + if (onDemandSignal_.exchange(false)) { + onDemandConfig = config_->clone(); + configureFromSignal(now, *onDemandConfig); + } else if (now > next_on_demand_load_time) { + onDemandConfig = std::make_unique(); + configureFromDaemon(now, *onDemandConfig); + next_on_demand_load_time = now + onDemandConfigUpdateIntervalSecs_; + } + if (onDemandConfig->verboseLogLevel() >= 0) { + LOG(INFO) << "Setting verbose level to " + << onDemandConfig->verboseLogLevel() + << " from on-demand config"; + SET_LOG_VERBOSITY_LEVEL( + onDemandConfig->verboseLogLevel(), + onDemandConfig->verboseLogModules()); + } + } +} + +bool ConfigLoader::hasNewConfig(const Config& oldConfig) { + std::lock_guard lock(configLock_); + return config_->timestamp() > oldConfig.timestamp(); +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/ConfigLoader.h b/tb_plugins/profiling/libkineto/src/ConfigLoader.h new file mode 100644 index 0000000000000000000000000000000000000000..4ce3468e48db116b2a40d992f000a3af1338e70a --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/ConfigLoader.h @@ -0,0 +1,147 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
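// [Editorial sketch] A condensed view of the wait/stop pattern that
// updateConfigThread() above relies on: the destructor sets the stop flag
// and notifies, so a wait of up to several minutes is interrupted promptly.
// Names mirror the members declared below; the sketch itself is not from
// this patch.
//
//   for (;;) {
//     {
//       std::unique_lock<std::mutex> lock(updateThreadMutex_);
//       updateThreadCondVar_.wait_for(lock, interval);  // timeout or notify
//     }
//     if (stopFlag_) {
//       break;  // woken early by ~ConfigLoader()
//     }
//     // ...reload base config / poll the daemon on their own schedules...
//   }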
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "Config.h" + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "ILoggerObserver.h" + +namespace libkineto { + class LibkinetoApi; +} + +namespace KINETO_NAMESPACE { + +using namespace libkineto; +class DaemonConfigLoader; + +class ConfigLoader { + public: + + static ConfigLoader& instance(); + + enum ConfigKind { + ActivityProfiler = 0, + EventProfiler, + NumConfigKinds + }; + + struct ConfigHandler { + virtual ~ConfigHandler() {} + virtual bool canAcceptConfig() = 0; + virtual void acceptConfig(const Config& cfg) = 0; + }; + + void addHandler(ConfigKind kind, ConfigHandler* handler) { + std::lock_guard lock(updateThreadMutex_); + handlers_[kind].push_back(handler); + startThread(); + } + + void removeHandler(ConfigKind kind, ConfigHandler* handler) { + std::lock_guard lock(updateThreadMutex_); + auto it = std::find( + handlers_[kind].begin(), handlers_[kind].end(), handler); + if (it != handlers_[kind].end()) { + handlers_[kind].erase(it); + } + } + + void notifyHandlers(const Config& cfg) { + std::lock_guard lock(updateThreadMutex_); + for (auto& key_val : handlers_) { + for (ConfigHandler* handler : key_val.second) { + handler->acceptConfig(cfg); + } + } + } + + bool canHandlerAcceptConfig(ConfigKind kind) { + std::lock_guard lock(updateThreadMutex_); + for (ConfigHandler* handler : handlers_[kind]) { + if (!handler->canAcceptConfig()) { + return false; + } + } + return true; + } + + void initBaseConfig() { + bool init = false; + { + std::lock_guard lock(configLock_); + init = !config_ || config_->source().empty(); + } + if (init) { + updateBaseConfig(); + } + } + + inline std::unique_ptr getConfigCopy() { + std::lock_guard lock(configLock_); + return config_->clone(); + } + + bool hasNewConfig(const Config& oldConfig); + int contextCountForGpu(uint32_t gpu); + + void handleOnDemandSignal(); + + static void setDaemonConfigLoaderFactory( + std::function()> factory); + + private: + ConfigLoader(); + ~ConfigLoader(); + + const char* configFileName(); + DaemonConfigLoader* daemonConfigLoader(); + + void startThread(); + void updateConfigThread(); + void updateBaseConfig(); + + // Create configuration when receiving SIGUSR2 + void configureFromSignal( + std::chrono::time_point now, + Config& config); + + // Create configuration when receiving request from a daemon + void configureFromDaemon( + std::chrono::time_point now, + Config& config); + + std::string readOnDemandConfigFromDaemon( + std::chrono::time_point now); + + std::mutex configLock_; + std::atomic configFileName_{nullptr}; + std::unique_ptr config_; + std::unique_ptr daemonConfigLoader_; + std::map> handlers_; + + std::chrono::seconds configUpdateIntervalSecs_; + std::chrono::seconds onDemandConfigUpdateIntervalSecs_; + std::unique_ptr updateThread_; + std::condition_variable updateThreadCondVar_; + std::mutex updateThreadMutex_; + std::atomic_bool stopFlag_{false}; + std::atomic_bool onDemandSignal_{false}; + +#if !USE_GOOGLE_LOG + std::unique_ptr> loggerObservers_; + std::mutex loggerObserversMutex_; +#endif // !USE_GOOGLE_LOG +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CudaDeviceProperties.cpp b/tb_plugins/profiling/libkineto/src/CudaDeviceProperties.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1e909d5f9cfda13b95cc4abab547d964fe47b48a --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CudaDeviceProperties.cpp @@ 
-0,0 +1,130 @@ +/* + * Copyright (c) Kineto Contributors + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "CudaDeviceProperties.h" + +#include +#include + +#include +#include + +#include "Logger.h" + +namespace KINETO_NAMESPACE { + +static const std::vector createDeviceProps() { + std::vector props; + int device_count; + cudaError_t error_id = cudaGetDeviceCount(&device_count); + // Return empty vector if error. + if (error_id != cudaSuccess) { + LOG(ERROR) << "cudaGetDeviceCount failed with code " << error_id; + return {}; + } + VLOG(0) << "Device count is " << device_count; + for (size_t i = 0; i < device_count; ++i) { + cudaDeviceProp prop; + error_id = cudaGetDeviceProperties(&prop, i); + // Return empty vector if any device property fail to get. + if (error_id != cudaSuccess) { + LOG(ERROR) << "cudaGetDeviceProperties failed with " << error_id; + return {}; + } + props.push_back(prop); + LOGGER_OBSERVER_ADD_DEVICE(i); + } + return props; +} + +static const std::vector& deviceProps() { + static const std::vector props = createDeviceProps(); + return props; +} + +static const std::string createDevicePropertiesJson( + size_t id, const cudaDeviceProp& props) { + return fmt::format(R"JSON( + {{ + "id": {}, "name": "{}", "totalGlobalMem": {}, + "computeMajor": {}, "computeMinor": {}, + "maxThreadsPerBlock": {}, "maxThreadsPerMultiprocessor": {}, + "regsPerBlock": {}, "regsPerMultiprocessor": {}, "warpSize": {}, + "sharedMemPerBlock": {}, "sharedMemPerMultiprocessor": {}, + "numSms": {}, "sharedMemPerBlockOptin": {} + }})JSON", + id, props.name, props.totalGlobalMem, + props.major, props.minor, + props.maxThreadsPerBlock, props.maxThreadsPerMultiProcessor, + props.regsPerBlock, props.regsPerMultiprocessor, props.warpSize, + props.sharedMemPerBlock, props.sharedMemPerMultiprocessor, + props.multiProcessorCount, props.sharedMemPerBlockOptin); +} + +static const std::string createDevicePropertiesJson() { + std::vector jsonProps; + const auto& props = deviceProps(); + for (size_t i = 0; i < props.size(); i++) { + jsonProps.push_back(createDevicePropertiesJson(i, props[i])); + } + return fmt::format("{}", fmt::join(jsonProps, ",")); +} + +const std::string& devicePropertiesJson() { + static std::string devicePropsJson = createDevicePropertiesJson(); + return devicePropsJson; +} + +int smCount(uint32_t deviceId) { + const std::vector &props = deviceProps(); + return deviceId >= props.size() ? 
0 : + props[deviceId].multiProcessorCount; +} + +float kernelOccupancy( + uint32_t deviceId, + uint16_t registersPerThread, + int32_t staticSharedMemory, + int32_t dynamicSharedMemory, + int32_t blockX, + int32_t blockY, + int32_t blockZ, + float blocksPerSm) { + // Calculate occupancy + float occupancy = -1.0; + const std::vector &props = deviceProps(); + if (deviceId < props.size()) { + cudaOccFuncAttributes occFuncAttr; + occFuncAttr.maxThreadsPerBlock = INT_MAX; + occFuncAttr.numRegs = registersPerThread; + occFuncAttr.sharedSizeBytes = staticSharedMemory; + occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF; + occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT; + occFuncAttr.maxDynamicSharedSizeBytes = 0; + const cudaOccDeviceState occDeviceState = {}; + int blockSize = blockX * blockY * blockZ; + size_t dynamicSmemSize = dynamicSharedMemory; + cudaOccResult occ_result; + cudaOccDeviceProp prop(props[deviceId]); + cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor( + &occ_result, &prop, &occFuncAttr, &occDeviceState, + blockSize, dynamicSmemSize); + if (status == CUDA_OCC_SUCCESS) { + if (occ_result.activeBlocksPerMultiprocessor < blocksPerSm) { + blocksPerSm = occ_result.activeBlocksPerMultiprocessor; + } + occupancy = blocksPerSm * blockSize / + (float) props[deviceId].maxThreadsPerMultiProcessor; + } else { + LOG_EVERY_N(ERROR, 1000) << "Failed to calculate occupancy, status = " + << status; + } + } + return occupancy; +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CudaDeviceProperties.h b/tb_plugins/profiling/libkineto/src/CudaDeviceProperties.h new file mode 100644 index 0000000000000000000000000000000000000000..b731fde0c2aab4c9bd3e97f475d204dad02986e7 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CudaDeviceProperties.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) Kineto Contributors + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace KINETO_NAMESPACE { + +int smCount(uint32_t deviceId); + +// Return estimated achieved occupancy for a kernel +float kernelOccupancy( + uint32_t deviceId, + uint16_t registersPerThread, + int32_t staticSharedMemory, + int32_t dynamicSharedMemory, + int32_t blockX, + int32_t blockY, + int32_t blockZ, + float blocks_per_sm); + +// Return compute properties for each device as a json string +const std::string& devicePropertiesJson(); + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiActivity.h b/tb_plugins/profiling/libkineto/src/CuptiActivity.h new file mode 100644 index 0000000000000000000000000000000000000000..09c29504060ecbbac609aa2d021ff643f45c143e --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiActivity.h @@ -0,0 +1,114 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include + +#include "ITraceActivity.h" +#include "CuptiActivityPlatform.h" +#include "ThreadUtil.h" +#include "cupti_strings.h" + +namespace libkineto { + class ActivityLogger; +} + +namespace KINETO_NAMESPACE { + +using namespace libkineto; +struct TraceSpan; + +// These classes wrap the various CUPTI activity types +// into subclasses of ITraceActivity so that they can all be accessed +// using the ITraceActivity interface and logged via ActivityLogger. 
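// [Editorial sketch] Typical use of these wrappers, assuming a CUPTI record
// already decoded from an activity buffer (caller code is illustrative, not
// part of the patch; RuntimeActivity is declared just below):
//
//   const CUpti_ActivityAPI* api_record = /* from a CUPTI buffer */;
//   RuntimeActivity wrapped(api_record, /*linked=*/nullptr, threadId);
//   wrapped.log(logger);  // dispatches via the ITraceActivity interface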
+ +// Abstract base class, templated on Cupti activity type +template +struct CuptiActivity : public ITraceActivity { + explicit CuptiActivity(const T* activity, const ITraceActivity* linked) + : activity_(*activity), linked_(linked) {} + int64_t timestamp() const override { + return nsToUs(unixEpochTimestamp(activity_.start)); + } + int64_t duration() const override { + return nsToUs(activity_.end - activity_.start); + } + // TODO(T107507796): Deprecate ITraceActivity + int64_t correlationId() const override {return 0;} + int32_t getThreadId() const override {return 0;} + const ITraceActivity* linkedActivity() const override {return linked_;} + int flowType() const override {return kLinkAsyncCpuGpu;} + int flowId() const override {return correlationId();} + const T& raw() const {return activity_;} + const TraceSpan* traceSpan() const override {return nullptr;} + + protected: + const T& activity_; + const ITraceActivity* linked_{nullptr}; +}; + +// CUpti_ActivityAPI - CUDA runtime activities +struct RuntimeActivity : public CuptiActivity { + explicit RuntimeActivity( + const CUpti_ActivityAPI* activity, + const ITraceActivity* linked, + int32_t threadId) + : CuptiActivity(activity, linked), threadId_(threadId) {} + int64_t correlationId() const override {return activity_.correlationId;} + int64_t deviceId() const override {return processId();} + int64_t resourceId() const override {return threadId_;} + ActivityType type() const override {return ActivityType::CUDA_RUNTIME;} + bool flowStart() const override; + const std::string name() const override {return runtimeCbidName(activity_.cbid);} + void log(ActivityLogger& logger) const override; + const std::string metadataJson() const override; + + private: + const int32_t threadId_; +}; + +// CUpti_ActivityAPI - CUDA runtime activities +struct OverheadActivity : public CuptiActivity { + explicit OverheadActivity( + const CUpti_ActivityOverhead* activity, + const ITraceActivity* linked, + int32_t threadId=0) + : CuptiActivity(activity, linked), threadId_(threadId) {} + + int64_t timestamp() const override { + return nsToUs(unixEpochTimestamp(activity_.start)); + } + int64_t duration() const override { + return nsToUs(activity_.end - activity_.start); + } + // TODO: Update this with PID ordering + int64_t deviceId() const override {return -1;} + int64_t resourceId() const override {return threadId_;} + ActivityType type() const override {return ActivityType::OVERHEAD;} + bool flowStart() const override; + const std::string name() const override {return overheadKindString(activity_.overheadKind);} + void log(ActivityLogger& logger) const override; + const std::string metadataJson() const override; + + private: + const int32_t threadId_; +}; + +// Base class for GPU activities. +// Can also be instantiated directly. 
+template +struct GpuActivity : public CuptiActivity { + explicit GpuActivity(const T* activity, const ITraceActivity* linked) + : CuptiActivity(activity, linked) {} + int64_t correlationId() const override {return raw().correlationId;} + int64_t deviceId() const override {return raw().deviceId;} + int64_t resourceId() const override {return raw().streamId;} + ActivityType type() const override; + bool flowStart() const override {return false;} + const std::string name() const override; + void log(ActivityLogger& logger) const override; + const std::string metadataJson() const override; + const T& raw() const {return CuptiActivity::raw();} +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiActivity.tpp b/tb_plugins/profiling/libkineto/src/CuptiActivity.tpp new file mode 100644 index 0000000000000000000000000000000000000000..1ff2dafe06b0016ce7b904ef4b55e047c69bcc1c --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiActivity.tpp @@ -0,0 +1,111 @@ + /* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "CuptiActivity.h" + +#include + +#include "Demangle.h" +#include "output_base.h" + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +template<> +inline const std::string GpuActivity::name() const { + return demangle(raw().name); +} + +template<> +inline ActivityType GpuActivity::type() const { + return ActivityType::CONCURRENT_KERNEL; +} + +static inline std::string memcpyName(uint8_t kind, uint8_t src, uint8_t dst) { + return fmt::format( + "Memcpy {} ({} -> {})", + memcpyKindString((CUpti_ActivityMemcpyKind)kind), + memoryKindString((CUpti_ActivityMemoryKind)src), + memoryKindString((CUpti_ActivityMemoryKind)dst)); +} + +template<> +inline ActivityType GpuActivity::type() const { + return ActivityType::GPU_MEMCPY; +} + +template<> +inline const std::string GpuActivity::name() const { + return memcpyName(raw().copyKind, raw().srcKind, raw().dstKind); +} + +template<> +inline ActivityType GpuActivity::type() const { + return ActivityType::GPU_MEMCPY; +} + +template<> +inline const std::string GpuActivity::name() const { + return memcpyName(raw().copyKind, raw().srcKind, raw().dstKind); +} + +template<> +inline const std::string GpuActivity::name() const { + const char* memory_kind = + memoryKindString((CUpti_ActivityMemoryKind)raw().memoryKind); + return fmt::format("Memset ({})", memory_kind); +} + +template<> +inline ActivityType GpuActivity::type() const { + return ActivityType::GPU_MEMSET; +} + +inline void RuntimeActivity::log(ActivityLogger& logger) const { + logger.handleActivity(*this); +} + +inline void OverheadActivity::log(ActivityLogger& logger) const { + logger.handleActivity(*this); +} + +inline bool OverheadActivity::flowStart() const { + return false; +} + +inline const std::string OverheadActivity::metadataJson() const { + return ""; +} + +template +inline void GpuActivity::log(ActivityLogger& logger) const { + logger.handleGpuActivity(*this); +} + +inline bool RuntimeActivity::flowStart() const { + return activity_.cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 || + (activity_.cbid >= CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 && + activity_.cbid <= CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_v3020) || + activity_.cbid == + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000 || + activity_.cbid == + 
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000;
+}
+
+inline const std::string RuntimeActivity::metadataJson() const {
+  return fmt::format(R"JSON(
+      "cbid": {}, "correlation": {})JSON",
+      activity_.cbid, activity_.correlationId);
+}
+
+template<class T>
+inline const std::string GpuActivity<T>::metadataJson() const {
+  return "";
+}
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/CuptiActivityApi.cpp b/tb_plugins/profiling/libkineto/src/CuptiActivityApi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5718bed2f89b06cc702d1b82976cd42e5fceebd0
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/CuptiActivityApi.cpp
@@ -0,0 +1,343 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include "CuptiActivityApi.h"
+
+#include <assert.h>
+#include <chrono>
+
+#include "cupti_call.h"
+#include "Logger.h"
+
+using namespace std::chrono;
+
+namespace KINETO_NAMESPACE {
+
+// TODO: do we want this to be configurable?
+// Set to 2MB to avoid constantly creating buffers (especially for networks
+// that have many small memcpys, such as sparseNN)
+// Consider putting this on huge pages?
+constexpr size_t kBufSize(2 * 1024 * 1024);
+
+CuptiActivityApi& CuptiActivityApi::singleton() {
+  static CuptiActivityApi instance;
+  return instance;
+}
+
+void CuptiActivityApi::pushCorrelationID(int id, CorrelationFlowType type) {
+#ifdef HAS_CUPTI
+  if (!singleton().externalCorrelationEnabled_) {
+    return;
+  }
+  VLOG(2) << "pushCorrelationID(" << id << ")";
+  switch(type) {
+    case Default:
+      CUPTI_CALL(cuptiActivityPushExternalCorrelationId(
+          CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0, id));
+      break;
+    case User:
+      CUPTI_CALL(cuptiActivityPushExternalCorrelationId(
+          CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, id));
+  }
+#endif
+}
+
+void CuptiActivityApi::popCorrelationID(CorrelationFlowType type) {
+#ifdef HAS_CUPTI
+  if (!singleton().externalCorrelationEnabled_) {
+    return;
+  }
+  switch(type) {
+    case Default:
+      CUPTI_CALL(cuptiActivityPopExternalCorrelationId(
+          CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0, nullptr));
+      break;
+    case User:
+      CUPTI_CALL(cuptiActivityPopExternalCorrelationId(
+          CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, nullptr));
+  }
+#endif
+}
+
+static int getSMCount() {
+#ifdef HAS_CUPTI
+  // There may be a simpler way to get the number of SMs....
+  // Look for domain_d - this has 80 instances on Volta and
+  // 56 instances on Pascal, corresponding to the number of SMs
+  // FIXME: This does not work on Turing and later
+  uint32_t domainCount{0};
+  CUPTI_CALL(cuptiDeviceGetNumEventDomains(0, &domainCount));
+  std::vector<CUpti_EventDomainID> ids(domainCount);
+  size_t sz = sizeof(CUpti_EventDomainID) * domainCount;
+  CUPTI_CALL(cuptiDeviceEnumEventDomains(0, &sz, ids.data()));
+  for (CUpti_EventDomainID id : ids) {
+    char name[16];
+    name[0] = '\0';
+    sz = sizeof(name);
+    CUPTI_CALL(cuptiEventDomainGetAttribute(
+        id, CUPTI_EVENT_DOMAIN_ATTR_NAME, &sz, name));
+    if (strncmp(name, "domain_d", sz) == 0) {
+      uint32_t count{0};
+      sz = sizeof(count);
+      CUPTI_CALL(cuptiDeviceGetEventDomainAttribute(
+          0, id, CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT, &sz, &count));
+      return count;
+    }
+  }
+#endif
+
+  return -1;
+}
+
+int CuptiActivityApi::smCount() {
+  static int sm_count = getSMCount();
+  return sm_count;
+}
+
+static bool nextActivityRecord(
+    uint8_t* buffer,
+    size_t valid_size,
+    CUpti_Activity*& record) {
+#ifdef HAS_CUPTI
+  CUptiResult status = CUPTI_CALL_NOWARN(
+      cuptiActivityGetNextRecord(buffer, valid_size, &record));
+  if (status != CUPTI_SUCCESS) {
+    if (status != CUPTI_ERROR_MAX_LIMIT_REACHED) {
+      CUPTI_CALL(status);
+    }
+    record = nullptr;
+  }
+#endif
+  return record != nullptr;
+}
+
+void CuptiActivityApi::setMaxBufferSize(int size) {
+  maxGpuBufferCount_ = 1 + size / kBufSize;
+}
+
+void CuptiActivityApi::forceLoadCupti() {
+#ifdef HAS_CUPTI
+  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
+#endif
+}
+
+#ifdef HAS_CUPTI
+void CUPTIAPI CuptiActivityApi::bufferRequestedTrampoline(
+    uint8_t** buffer,
+    size_t* size,
+    size_t* maxNumRecords) {
+  singleton().bufferRequested(buffer, size, maxNumRecords);
+}
+
+void CuptiActivityApi::bufferRequested(
+    uint8_t** buffer, size_t* size, size_t* maxNumRecords) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  if (allocatedGpuTraceBuffers_.size() >= maxGpuBufferCount_) {
+    stopCollection = true;
+    LOG(WARNING) << "Exceeded max GPU buffer count ("
+                 << allocatedGpuTraceBuffers_.size()
+                 << " > " << maxGpuBufferCount_
+                 << ") - terminating tracing";
+  }
+
+  auto buf = std::make_unique<CuptiActivityBuffer>(kBufSize);
+  *buffer = buf->data();
+  *size = kBufSize;
+
+  allocatedGpuTraceBuffers_[*buffer] = std::move(buf);
+
+  *maxNumRecords = 0;
+}
+#endif
+
+std::unique_ptr<CuptiActivityBufferMap>
+CuptiActivityApi::activityBuffers() {
+  {
+    std::lock_guard<std::mutex> guard(mutex_);
+    if (allocatedGpuTraceBuffers_.empty()) {
+      return nullptr;
+    }
+  }
+
+#ifdef HAS_CUPTI
+  VLOG(1) << "Flushing GPU activity buffers";
+  time_point<system_clock> t1;
+  if (VLOG_IS_ON(1)) {
+    t1 = system_clock::now();
+  }
+  // Can't hold mutex_ during this call, since bufferCompleted
+  // will be called by libcupti and mutex_ is acquired there.
+  CUPTI_CALL(cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
+  if (VLOG_IS_ON(1)) {
+    flushOverhead =
+        duration_cast<microseconds>(system_clock::now() - t1).count();
+  }
+#endif
+  std::lock_guard<std::mutex> guard(mutex_);
+  // Transfer ownership of buffers to caller. A new map is created on-demand.
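  // [Editorial worked example] The hand-off below is bounded by
  // setMaxBufferSize() above: maxGpuBufferCount_ = 1 + size / kBufSize, so a
  // 64MB budget over 2MB chunks allows 1 + 32 = 33 outstanding buffers, and
  // bufferRequested() flags stopCollection once a 34th is requested.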
+ return std::move(readyGpuTraceBuffers_); +} + +#ifdef HAS_CUPTI +int CuptiActivityApi::processActivitiesForBuffer( + uint8_t* buf, + size_t validSize, + std::function handler) { + int count = 0; + if (buf && validSize) { + CUpti_Activity* record{nullptr}; + while ((nextActivityRecord(buf, validSize, record))) { + handler(record); + ++count; + } + } + return count; +} +#endif + +const std::pair CuptiActivityApi::processActivities( + CuptiActivityBufferMap& buffers, + std::function handler) { + std::pair res{0, 0}; +#ifdef HAS_CUPTI + for (auto& pair : buffers) { + // No lock needed - only accessed from this thread + auto& buf = pair.second; + res.first += processActivitiesForBuffer(buf->data(), buf->size(), handler); + res.second += buf->size(); + } +#endif + return res; +} + +void CuptiActivityApi::clearActivities() { + { + std::lock_guard guard(mutex_); + if (allocatedGpuTraceBuffers_.empty()) { + return; + } + } + // Can't hold mutex_ during this call, since bufferCompleted + // will be called by libcupti and mutex_ is acquired there. +#ifdef HAS_CUPTI + CUPTI_CALL(cuptiActivityFlushAll(0)); +#endif + // FIXME: We might want to make sure we reuse + // the same memory during warmup and tracing. + // Also, try to use the amount of memory required + // for active tracing during warmup. + std::lock_guard guard(mutex_); + // Throw away ready buffers as a result of above flush + readyGpuTraceBuffers_ = nullptr; +} + +#ifdef HAS_CUPTI +void CUPTIAPI CuptiActivityApi::bufferCompletedTrampoline( + CUcontext ctx, + uint32_t streamId, + uint8_t* buffer, + size_t /* unused */, + size_t validSize) { + singleton().bufferCompleted(ctx, streamId, buffer, 0, validSize); +} + +void CuptiActivityApi::bufferCompleted( + CUcontext ctx, + uint32_t streamId, + uint8_t* buffer, + size_t /* unused */, + size_t validSize) { + + std::lock_guard guard(mutex_); + auto it = allocatedGpuTraceBuffers_.find(buffer); + if (it == allocatedGpuTraceBuffers_.end()) { + LOG(ERROR) << "bufferCompleted called with unknown buffer: " + << (void*) buffer; + return; + } + + if (!readyGpuTraceBuffers_) { + readyGpuTraceBuffers_ = std::make_unique(); + } + // Set valid size of buffer before moving to ready map + it->second->setSize(validSize); + (*readyGpuTraceBuffers_)[it->first] = std::move(it->second); + allocatedGpuTraceBuffers_.erase(it); + + // report any records dropped from the queue; to avoid unnecessary cupti + // API calls, we make it report only in verbose mode (it doesn't happen + // often in our testing anyways) + if (VLOG_IS_ON(1)) { + size_t dropped = 0; + CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); + if (dropped != 0) { + LOG(WARNING) << "Dropped " << dropped << " activity records"; + } + } +} +#endif + +void CuptiActivityApi::enableCuptiActivities( + const std::set& selected_activities) { +#ifdef HAS_CUPTI + static bool registered = false; + if (!registered) { + CUPTI_CALL( + cuptiActivityRegisterCallbacks(bufferRequestedTrampoline, bufferCompletedTrampoline)); + } + + externalCorrelationEnabled_ = false; + for (const auto& activity : selected_activities) { + if (activity == ActivityType::GPU_MEMCPY) { + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); + } + if (activity == ActivityType::GPU_MEMSET) { + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); + } + if (activity == ActivityType::CONCURRENT_KERNEL) { + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); + } + if (activity == ActivityType::EXTERNAL_CORRELATION) { + 
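      // [Editorial note] External correlation makes CUPTI emit records that
      // pair each runtime correlation id with the id supplied via
      // pushCorrelationID(): CUSTOM0 for default (op) ids, CUSTOM1 for user
      // annotations. CuptiActivityProfiler::handleCorrelationActivity()
      // later splits them into the cpu and user correlation maps.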
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); + externalCorrelationEnabled_ = true; + } + if (activity == ActivityType::CUDA_RUNTIME) { + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); + } + if (activity == ActivityType::OVERHEAD) { + CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); + } + } +#endif + + // Explicitly enabled, so reset this flag if set + stopCollection = false; +} + +void CuptiActivityApi::disableCuptiActivities( + const std::set& selected_activities) { +#ifdef HAS_CUPTI + for (const auto& activity : selected_activities) { + if (activity == ActivityType::GPU_MEMCPY) { + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); + } + if (activity == ActivityType::GPU_MEMSET) { + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); + } + if (activity == ActivityType::CONCURRENT_KERNEL) { + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); + } + if (activity == ActivityType::EXTERNAL_CORRELATION) { + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); + } + if (activity == ActivityType::CUDA_RUNTIME) { + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); + } + if (activity == ActivityType::OVERHEAD) { + CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); + } + } + externalCorrelationEnabled_ = false; +#endif +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiActivityApi.h b/tb_plugins/profiling/libkineto/src/CuptiActivityApi.h new file mode 100644 index 0000000000000000000000000000000000000000..92af51ecac9ec99181c4726c3849894de9e32b33 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiActivityApi.h @@ -0,0 +1,100 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
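// [Editorial sketch] Hedged lifecycle for the API declared below, based on
// the .cpp above (caller code is illustrative, not part of the patch):
//
//   auto& api = CuptiActivityApi::singleton();
//   api.setMaxBufferSize(max_buffer_bytes);
//   api.enableCuptiActivities(selected_types);   // begin GPU collection
//   // ... run the workload under trace ...
//   api.disableCuptiActivities(selected_types);
//   auto buffers = api.activityBuffers();        // force-flush and take over
//   api.processActivities(*buffers, handler);    // decode records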
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifdef HAS_CUPTI +#include +#endif + +#include "ActivityType.h" +#include "CuptiActivityBuffer.h" + + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +#ifndef HAS_CUPTI +using CUpti_Activity = void; +#endif + +class CuptiActivityApi { + public: + enum CorrelationFlowType { + Default, + User + }; + + CuptiActivityApi() = default; + CuptiActivityApi(const CuptiActivityApi&) = delete; + CuptiActivityApi& operator=(const CuptiActivityApi&) = delete; + + virtual ~CuptiActivityApi() {} + + static CuptiActivityApi& singleton(); + + virtual int smCount(); + static void pushCorrelationID(int id, CorrelationFlowType type); + static void popCorrelationID(CorrelationFlowType type); + + void enableCuptiActivities( + const std::set& selected_activities); + void disableCuptiActivities( + const std::set& selected_activities); + void clearActivities(); + + virtual std::unique_ptr activityBuffers(); + + virtual const std::pair processActivities( + CuptiActivityBufferMap&, + std::function handler); + + void setMaxBufferSize(int size); + + std::atomic_bool stopCollection{false}; + int64_t flushOverhead{0}; + + static void forceLoadCupti(); + + private: +#ifdef HAS_CUPTI + int processActivitiesForBuffer( + uint8_t* buf, + size_t validSize, + std::function handler); + static void CUPTIAPI + bufferRequestedTrampoline(uint8_t** buffer, size_t* size, size_t* maxNumRecords); + static void CUPTIAPI bufferCompletedTrampoline( + CUcontext ctx, + uint32_t streamId, + uint8_t* buffer, + size_t /* unused */, + size_t validSize); +#endif // HAS_CUPTI + + int maxGpuBufferCount_{0}; + CuptiActivityBufferMap allocatedGpuTraceBuffers_; + std::unique_ptr readyGpuTraceBuffers_; + std::mutex mutex_; + bool externalCorrelationEnabled_{false}; + + protected: +#ifdef HAS_CUPTI + void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords); + void bufferCompleted( + CUcontext ctx, + uint32_t streamId, + uint8_t* buffer, + size_t /* unused */, + size_t validSize); +#endif +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiActivityBuffer.h b/tb_plugins/profiling/libkineto/src/CuptiActivityBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..1c3fbef62c8d8f42ff5da1718e20315cc1ba95d5 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiActivityBuffer.h @@ -0,0 +1,51 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
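// [Editorial sketch] The push/pop correlation calls declared above are meant
// to bracket a client-side op; a hypothetical RAII guard (not in this patch)
// makes the pairing explicit:
//
//   struct CorrelationGuard {
//     explicit CorrelationGuard(int id) {
//       CuptiActivityApi::pushCorrelationID(id, CuptiActivityApi::Default);
//     }
//     ~CorrelationGuard() {
//       CuptiActivityApi::popCorrelationID(CuptiActivityApi::Default);
//     }
//   };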
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "ITraceActivity.h" + +namespace KINETO_NAMESPACE { + +class CuptiActivityBuffer { + public: + explicit CuptiActivityBuffer(size_t size) : size_(size) { + buf_.reserve(size); + } + CuptiActivityBuffer() = delete; + CuptiActivityBuffer& operator=(const CuptiActivityBuffer&) = delete; + CuptiActivityBuffer(CuptiActivityBuffer&&) = default; + CuptiActivityBuffer& operator=(CuptiActivityBuffer&&) = default; + + size_t size() const { + return size_; + } + + void setSize(size_t size) { + assert(size <= buf_.capacity()); + size_ = size; + } + + uint8_t* data() { + return buf_.data(); + } + + private: + + std::vector buf_; + size_t size_; + + std::vector> wrappers_; +}; + +using CuptiActivityBufferMap = + std::map>; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiActivityPlatform.cpp b/tb_plugins/profiling/libkineto/src/CuptiActivityPlatform.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fa2ef2f3a8c9cbb7f10567c158d6ee3e8e26eed0 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiActivityPlatform.cpp @@ -0,0 +1,31 @@ +#include + +namespace chrono = std::chrono; + +namespace KINETO_NAMESPACE { + +#ifdef _WIN32 +uint64_t epochs_diff() { + // On Windows, steady_clock wraps the QueryPerformanceCounter function. + // https://docs.microsoft.com/en-us/cpp/standard-library/steady-clock-struct?view=msvc-160 + auto steady = + chrono::time_point_cast(chrono::steady_clock::now()); + auto system = + chrono::time_point_cast(chrono::system_clock::now()); + + auto time_since_unix = system.time_since_epoch().count(); + auto time_since_boot = steady.time_since_epoch().count(); + return time_since_unix - time_since_boot; +} + +uint64_t unixEpochTimestamp(uint64_t ts) { + static uint64_t diff = epochs_diff(); + return ts + diff; +} +#else +uint64_t unixEpochTimestamp(uint64_t ts) { + return ts; +} +#endif // _WIN32 + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiActivityPlatform.h b/tb_plugins/profiling/libkineto/src/CuptiActivityPlatform.h new file mode 100644 index 0000000000000000000000000000000000000000..78de8373d5fe391d48edffc897aff6893aa6f54f --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiActivityPlatform.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +namespace KINETO_NAMESPACE { + +// cupti's timestamps are platform specific. This function convert the raw +// cupti timestamp to time since unix epoch. So that on different platform, +// correction can work correctly. +uint64_t unixEpochTimestamp(uint64_t ts); + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiActivityProfiler.cpp b/tb_plugins/profiling/libkineto/src/CuptiActivityProfiler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..97c23ef047d75aff75b56773a20801ce83fb1653 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiActivityProfiler.cpp @@ -0,0 +1,841 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
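// [Editorial worked example] For the Windows path in CuptiActivityPlatform
// above: both clocks are sampled once, so with illustrative numbers
//   time_since_unix = 1'600'000'123s and time_since_boot = 123s,
// the cached diff is 1'600'000'000s and every raw boot-relative CUPTI
// timestamp ts maps to unix time as ts + diff. The Linux build returns ts
// unchanged.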
+ +#include "CuptiActivityProfiler.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAS_CUPTI +#include +#endif + +#include "Config.h" +#include "time_since_epoch.h" +#ifdef HAS_CUPTI +#include "CuptiActivity.h" +#include "CuptiActivity.tpp" +#include "CuptiActivityApi.h" +#endif // HAS_CUPTI +#ifdef HAS_ROCTRACER +#include "RoctracerActivityApi.h" +#endif +#include "output_base.h" + +#include "Logger.h" +#include "ThreadUtil.h" + +using namespace std::chrono; +using namespace libkineto; +using std::string; + +namespace KINETO_NAMESPACE { + +void CuptiActivityProfiler::transferCpuTrace( + std::unique_ptr cpuTrace) { + std::lock_guard guard(mutex_); + const string& trace_name = cpuTrace->span.name; + if (currentRunloopState_ != RunloopState::CollectTrace && + currentRunloopState_ != RunloopState::ProcessTrace) { + VLOG(0) << "Trace collection not in progress - discarding span " + << trace_name; + return; + } + + cpuTrace->span.iteration = iterationCountMap_[trace_name]++; + + VLOG(0) << "Received iteration " << cpuTrace->span.iteration << " of span " + << trace_name << " (" << cpuTrace->activities.size() << " activities / " + << cpuTrace->gpuOpCount << " gpu activities)"; + traceBuffers_->cpu.push_back(std::move(cpuTrace)); +} + +#ifdef HAS_ROCTRACER +CuptiActivityProfiler::CuptiActivityProfiler(RoctracerActivityApi& cupti, bool cpuOnly) +#else +CuptiActivityProfiler::CuptiActivityProfiler(CuptiActivityApi& cupti, bool cpuOnly) +#endif + : cupti_(cupti), + flushOverhead_{0, 0}, + setupOverhead_{0, 0}, + cpuOnly_{cpuOnly}, + currentRunloopState_{RunloopState::WaitForRequest}, + stopCollection_{false} {} + +void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { + LOG(INFO) << "Processing " << traceBuffers_->cpu.size() + << " CPU buffers"; + VLOG(0) << "Profile time range: " << captureWindowStartTime_ << " - " + << captureWindowEndTime_; + logger.handleTraceStart(metadata_); + for (auto& cpu_trace : traceBuffers_->cpu) { + string trace_name = cpu_trace->span.name; + VLOG(0) << "Processing CPU buffer for " << trace_name << " (" + << cpu_trace->span.iteration << ") - " + << cpu_trace->activities.size() << " records"; + VLOG(0) << "Span time range: " << cpu_trace->span.startTime << " - " + << cpu_trace->span.endTime; + processCpuTrace(*cpu_trace, logger); + LOGGER_OBSERVER_ADD_EVENT_COUNT(cpu_trace->activities.size()); + } + +#ifdef HAS_CUPTI + if (!cpuOnly_) { + VLOG(0) << "Retrieving GPU activity buffers"; + traceBuffers_->gpu = cupti_.activityBuffers(); + if (VLOG_IS_ON(1)) { + addOverheadSample(flushOverhead_, cupti_.flushOverhead); + } + if (traceBuffers_->gpu) { + const auto count_and_size = cupti_.processActivities( + *traceBuffers_->gpu, + std::bind(&CuptiActivityProfiler::handleCuptiActivity, this, std::placeholders::_1, &logger)); + LOG(INFO) << "Processed " << count_and_size.first + << " GPU records (" << count_and_size.second << " bytes)"; + LOGGER_OBSERVER_ADD_EVENT_COUNT(count_and_size.first); + } + } +#endif // HAS_CUPTI +#ifdef HAS_ROCTRACER + if (!cpuOnly_) { + VLOG(0) << "Retrieving GPU activity buffers"; + const int count = cupti_.processActivities(logger); + LOG(INFO) << "Processed " << count + << " GPU records"; + LOGGER_OBSERVER_ADD_EVENT_COUNT(count); + } +#endif // HAS_ROCTRACER + + for (const auto& session : sessions_){ + LOG(INFO) << "Processing child profiler trace"; + session->processTrace(logger); + } + + finalizeTrace(*config_, logger); +} + +CuptiActivityProfiler::CpuGpuSpanPair& 
CuptiActivityProfiler::recordTraceSpan( + TraceSpan& span, int gpuOpCount) { + TraceSpan gpu_span(gpuOpCount, span.iteration, span.name, "GPU: "); + auto& iterations = traceSpans_[span.name]; + iterations.push_back({span, gpu_span}); + return iterations.back(); +} + +void CuptiActivityProfiler::processCpuTrace( + libkineto::CpuTraceBuffer& cpuTrace, + ActivityLogger& logger) { + if (cpuTrace.activities.size() == 0) { + LOG(WARNING) << "CPU trace is empty!"; + return; + } + + CpuGpuSpanPair& span_pair = recordTraceSpan(cpuTrace.span, cpuTrace.gpuOpCount); + TraceSpan& cpu_span = span_pair.first; + for (auto const& act : cpuTrace.activities) { + VLOG(2) << act.correlationId() << ": OP " << act.activityName; + if (config_->selectedActivityTypes().count(act.type())) { + act.log(logger); + } + clientActivityTraceMap_[act.correlationId()] = &span_pair; + activityMap_[act.correlationId()] = &act; + + recordThreadInfo(act.resourceId(), act.getThreadId(), act.deviceId()); + } + logger.handleTraceSpan(cpu_span); +} + +#ifdef HAS_CUPTI +inline void CuptiActivityProfiler::handleCorrelationActivity( + const CUpti_ActivityExternalCorrelation* correlation) { + if (correlation->externalKind == CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0) { + cpuCorrelationMap_[correlation->correlationId] = correlation->externalId; + } else if (correlation->externalKind == CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1){ + userCorrelationMap_[correlation->correlationId] = correlation->externalId; + } else { + LOG(ERROR) << "Invalid CUpti_ActivityExternalCorrelation sent to handleCuptiActivity"; + } +} +#endif // HAS_CUPTI + +static GenericTraceActivity createUserGpuSpan( + const libkineto::ITraceActivity& cpuTraceActivity, + const libkineto::ITraceActivity& gpuTraceActivity) { + GenericTraceActivity res( + *cpuTraceActivity.traceSpan(), + ActivityType::GPU_USER_ANNOTATION, + cpuTraceActivity.name()); + res.startTime = gpuTraceActivity.timestamp(); + res.device = gpuTraceActivity.deviceId(); + res.resource = gpuTraceActivity.resourceId(); + res.endTime = + gpuTraceActivity.timestamp() + gpuTraceActivity.duration(); + res.id = cpuTraceActivity.correlationId(); + return res; +} + +void CuptiActivityProfiler::GpuUserEventMap::insertOrExtendEvent( + const ITraceActivity& userActivity, + const ITraceActivity& gpuActivity) { + StreamKey key(gpuActivity.deviceId(), gpuActivity.resourceId()); + CorrelationSpanMap& correlationSpanMap = streamSpanMap_[key]; + auto it = correlationSpanMap.find(userActivity.correlationId()); + if (it == correlationSpanMap.end()) { + auto it_success = correlationSpanMap.insert({ + userActivity.correlationId(), createUserGpuSpan(userActivity, gpuActivity) + }); + it = it_success.first; + } + GenericTraceActivity& span = it->second; + if (gpuActivity.timestamp() < span.startTime || span.startTime == 0) { + span.startTime = gpuActivity.timestamp(); + } + int64_t gpu_activity_end = gpuActivity.timestamp() + gpuActivity.duration(); + if (gpu_activity_end > span.endTime) { + span.endTime = gpu_activity_end; + } +} + +const CuptiActivityProfiler::CpuGpuSpanPair& CuptiActivityProfiler::defaultTraceSpan() { + static TraceSpan span(0, 0, "Unknown", ""); + static CpuGpuSpanPair span_pair(span, span); + return span_pair; +} + +void CuptiActivityProfiler::GpuUserEventMap::logEvents(ActivityLogger *logger) { + for (auto const& streamMapPair : streamSpanMap_) { + for (auto const& correlationSpanPair : streamMapPair.second) { + correlationSpanPair.second.log(*logger); + } + } +} + +#ifdef HAS_CUPTI +inline bool 
CuptiActivityProfiler::outOfRange(const ITraceActivity& act) { + bool out_of_range = act.timestamp() < captureWindowStartTime_ || + (act.timestamp() + act.duration()) > captureWindowEndTime_; + if (out_of_range) { + VLOG(2) << "TraceActivity outside of profiling window: " << act.name() + << " (" << act.timestamp() << " < " << captureWindowStartTime_ << " or " + << (act.timestamp() + act.duration()) << " > " << captureWindowEndTime_; + } + return out_of_range; +} + +inline static bool isBlockListedRuntimeCbid(CUpti_CallbackId cbid) { + // Some CUDA calls that are very frequent and also not very interesting. + // Filter these out to reduce trace size. + if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 || + cbid == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 || + cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 || + // Don't care about cudaEvents + cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 || + cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 || + cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020 || + cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020 || + cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020) { + return true; + } + + return false; +} + +void CuptiActivityProfiler::handleRuntimeActivity( + const CUpti_ActivityAPI* activity, + ActivityLogger* logger) { + if (isBlockListedRuntimeCbid(activity->cbid)) { + return; + } + VLOG(2) << activity->correlationId + << ": CUPTI_ACTIVITY_KIND_RUNTIME, cbid=" << activity->cbid + << " tid=" << activity->threadId; + int32_t tid = activity->threadId; + const auto& it = resourceInfo_.find({processId(), tid}); + if (it != resourceInfo_.end()) { + tid = it->second.id; + } + const ITraceActivity* linked = linkedActivity( + activity->correlationId, cpuCorrelationMap_); + const auto& runtime_activity = + traceBuffers_->addActivityWrapper(RuntimeActivity(activity, linked, tid)); + checkTimestampOrder(&runtime_activity); + if (outOfRange(runtime_activity)) { + return; + } + runtime_activity.log(*logger); +} + +void CuptiActivityProfiler::handleOverheadActivity( + const CUpti_ActivityOverhead* activity, + ActivityLogger* logger) { + VLOG(2) << ": CUPTI_ACTIVITY_KIND_OVERHEAD" << " overheadKind=" << activity->overheadKind; + + const auto& overhead_activity = + traceBuffers_->addActivityWrapper(OverheadActivity(activity, nullptr)); + overhead_activity.log(*logger); +} + + +inline void CuptiActivityProfiler::updateGpuNetSpan( + const ITraceActivity& gpuOp) { + if (!gpuOp.linkedActivity()) { + VLOG(0) << "Missing linked activity"; + return; + } + const auto& it = clientActivityTraceMap_.find( + gpuOp.linkedActivity()->correlationId()); + if (it == clientActivityTraceMap_.end()) { + // No correlation id mapping? + return; + } + TraceSpan& gpu_span = it->second->second; + if (gpuOp.timestamp() < gpu_span.startTime || gpu_span.startTime == 0) { + gpu_span.startTime = gpuOp.timestamp(); + } + if ((gpuOp.timestamp() + gpuOp.duration()) > gpu_span.endTime) { + gpu_span.endTime = gpuOp.timestamp() + gpuOp.duration(); + } +} + +// I've observed occasional broken timestamps attached to GPU events... 
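// [Editorial note] The check below caches the first activity seen for each
// correlation id; when its partner arrives, the pair is normalized (via the
// swap) so that act1 is always the CUDA_RUNTIME launch and act2 the GPU op,
// and a warning is logged if the launch appears to happen after the GPU op.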
+void CuptiActivityProfiler::checkTimestampOrder(const ITraceActivity* act1) {
+  // Correlated GPU runtime activity cannot
+  // have timestamp greater than the GPU activity's
+  const auto& it = correlatedCudaActivities_.find(act1->correlationId());
+  if (it == correlatedCudaActivities_.end()) {
+    correlatedCudaActivities_.insert({act1->correlationId(), act1});
+    return;
+  }
+
+  // Activities may appear in the buffers out of order.
+  // If we have a runtime activity in the map, it should mean that we
+  // have a GPU activity passed in, and vice versa.
+  const ITraceActivity* act2 = it->second;
+  if (act2->type() == ActivityType::CUDA_RUNTIME) {
+    // Buffer is out-of-order.
+    // Swap so that runtime activity is first for the comparison below.
+    std::swap(act1, act2);
+  }
+  if (act1->timestamp() > act2->timestamp()) {
+    LOG(WARNING) << "GPU op timestamp (" << act2->timestamp()
+                 << ") < runtime timestamp (" << act1->timestamp() << ") by "
+                 << act1->timestamp() - act2->timestamp() << "us";
+    LOG(WARNING) << "Name: " << act2->name()
+                 << " Device: " << act2->deviceId()
+                 << " Stream: " << act2->resourceId();
+  }
+}
+
+inline void CuptiActivityProfiler::handleGpuActivity(
+    const ITraceActivity& act,
+    ActivityLogger* logger) {
+  if (outOfRange(act)) {
+    return;
+  }
+  checkTimestampOrder(&act);
+  VLOG(2) << act.correlationId() << ": "
+          << act.name();
+  recordStream(act.deviceId(), act.resourceId(), "");
+  act.log(*logger);
+  updateGpuNetSpan(act);
+  if (config_->selectedActivityTypes().count(ActivityType::GPU_USER_ANNOTATION)) {
+    const auto& it = userCorrelationMap_.find(act.correlationId());
+    if (it != userCorrelationMap_.end()) {
+      const auto& it2 = activityMap_.find(it->second);
+      if (it2 != activityMap_.end()) {
+        recordStream(act.deviceId(), act.resourceId(), "context");
+        gpuUserEventMap_.insertOrExtendEvent(*it2->second, act);
+      }
+    }
+  }
+}
+
+const ITraceActivity* CuptiActivityProfiler::linkedActivity(
+    int32_t correlationId,
+    const std::unordered_map<int64_t, int64_t>& correlationMap) {
+  const auto& it = correlationMap.find(correlationId);
+  if (it != correlationMap.end()) {
+    const auto& it2 = activityMap_.find(it->second);
+    if (it2 != activityMap_.end()) {
+      return it2->second;
+    }
+  }
+  return nullptr;
+}
+
+template <class T>
+inline void CuptiActivityProfiler::handleGpuActivity(
+    const T* act, ActivityLogger* logger) {
+  const ITraceActivity* linked = linkedActivity(
+      act->correlationId, cpuCorrelationMap_);
+  const auto& gpu_activity =
+      traceBuffers_->addActivityWrapper(GpuActivity<T>(act, linked));
+  handleGpuActivity(gpu_activity, logger);
+}
+
+void CuptiActivityProfiler::handleCuptiActivity(const CUpti_Activity* record, ActivityLogger* logger) {
+  switch (record->kind) {
+    case CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION:
+      handleCorrelationActivity(
+          reinterpret_cast<const CUpti_ActivityExternalCorrelation*>(
+              record));
+      break;
+    case CUPTI_ACTIVITY_KIND_RUNTIME:
+      handleRuntimeActivity(
+          reinterpret_cast<const CUpti_ActivityAPI*>(record), logger);
+      break;
+    case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
+      handleGpuActivity(
+          reinterpret_cast<const CUpti_ActivityKernel4*>(record), logger);
+      break;
+    case CUPTI_ACTIVITY_KIND_MEMCPY:
+      handleGpuActivity(
+          reinterpret_cast<const CUpti_ActivityMemcpy*>(record), logger);
+      break;
+    case CUPTI_ACTIVITY_KIND_MEMCPY2:
+      handleGpuActivity(
+          reinterpret_cast<const CUpti_ActivityMemcpy2*>(record), logger);
+      break;
+    case CUPTI_ACTIVITY_KIND_MEMSET:
+      handleGpuActivity(
+          reinterpret_cast<const CUpti_ActivityMemset*>(record), logger);
+      break;
+    case CUPTI_ACTIVITY_KIND_OVERHEAD:
+      handleOverheadActivity(
+          reinterpret_cast<const CUpti_ActivityOverhead*>(record), logger);
+      break;
+    default:
+      LOG(WARNING) << "Unexpected activity type: " << record->kind;
+      break;
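    // [Editorial note] This switch is the single dispatch point from raw
    // CUPTI records to typed handlers; supporting a new record kind means a
    // new case here plus matching GpuActivity<T> specializations in
    // CuptiActivity.tpp.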
+ } +} +#endif // HAS_CUPTI + +void CuptiActivityProfiler::configureChildProfilers() { + // If child profilers are enabled create profiler sessions + for (auto& profiler: profilers_) { + int64_t start_time_ms = duration_cast( + profileStartTime_.time_since_epoch()).count(); + LOG(INFO) << "Running child profiler " << profiler->name() << " for " + << config_->activitiesDuration().count() << " ms"; + auto session = profiler->configure( + start_time_ms, + config_->activitiesDuration().count(), + config_->selectedActivityTypes(), + *config_ + ); + if (session) { + sessions_.push_back(std::move(session)); + } + } +} + +void CuptiActivityProfiler::configure( + const Config& config, + const time_point& now) { + std::lock_guard guard(mutex_); + if (isActive()) { + LOG(ERROR) << "CuptiActivityProfiler already busy, terminating"; + return; + } + + config_ = config.clone(); + + if (config_->activitiesDuration().count() == 0) { + // Use default if not specified + config_->setActivitiesDuration( + config_->activitiesDurationDefault()); + } + + // Ensure we're starting in a clean state + resetTraceData(); + +#if !USE_GOOGLE_LOG + // Add a LoggerObserverCollector to collect all logs during the trace. + loggerCollectorMetadata_ = std::make_unique(); + Logger::addLoggerObserver(loggerCollectorMetadata_.get()); +#endif // !USE_GOOGLE_LOG + + profileStartTime_ = config_->requestTimestamp(); + + if (config_->hasProfileStartIteration()) { + profileStartIter_ = config_->profileStartIteration(); + profileEndIter_ = profileStartIter_ + config_->activitiesRunIterations(); + } else { + + profileStartIter_ = -1; + profileEndIter_ = (std::numeric_limits::max)(); + + if (profileStartTime_ < now) { + LOG(ERROR) << "Not starting tracing - start timestamp is in the past. Time difference (ms): " << duration_cast(now - profileStartTime_).count(); + return; + } else if ((profileStartTime_ - now) < config_->activitiesWarmupDuration()) { + LOG(ERROR) << "Not starting tracing - insufficient time for warmup. Time to warmup (ms): " << duration_cast(profileStartTime_ - now).count() ; + return; + } + } + + if (LOG_IS_ON(INFO)) { + config_->printActivityProfilerConfig(LIBKINETO_DBG_STREAM); + } + if (!cpuOnly_ && !libkineto::api().client()) { + if (profileStartIter_ < 0) { + LOG(INFO) << "GPU-only tracing for " + << config_->activitiesDuration().count() << "ms"; + } else { + LOG(INFO) << "GPU-only tracing for " + << config_->activitiesRunIterations() << " iterations"; + } + } + + // Set useful metadata into the logger. + LOGGER_OBSERVER_SET_TRACE_DURATION_MS(config_->activitiesDuration().count()); + if (!config_->requestTraceID().empty()) { + LOGGER_OBSERVER_SET_TRACE_ID(config_->requestTraceID()); + } + if (!config_->requestGroupTraceID().empty()) { + LOGGER_OBSERVER_SET_GROUP_TRACE_ID(config_->requestGroupTraceID()); + } + LOGGER_OBSERVER_ADD_DESTINATION(config_->activitiesLogUrl()); + +#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) + if (!cpuOnly_) { + // Enabling CUPTI activity tracing incurs a larger perf hit at first, + // presumably because structures are allocated and initialized, callbacks + // are activated etc. After a while the overhead decreases and stabilizes. + // It's therefore useful to perform some warmup before starting recording. 
+ LOG(INFO) << "Enabling GPU tracing"; + cupti_.setMaxBufferSize(config_->activitiesMaxGpuBufferSize()); + + time_point timestamp; + if (VLOG_IS_ON(1)) { + timestamp = system_clock::now(); + } +#ifdef HAS_CUPTI + cupti_.enableCuptiActivities(config_->selectedActivityTypes()); +#else + cupti_.enableActivities(config_->selectedActivityTypes()); +#endif + if (VLOG_IS_ON(1)) { + auto t2 = system_clock::now(); + addOverheadSample( + setupOverhead_, duration_cast(t2 - timestamp).count()); + } + } +#endif // HAS_CUPTI || HAS_ROCTRACER + + if (profilers_.size() > 0) { + configureChildProfilers(); + } + + if (libkineto::api().client()) { + libkineto::api().client()->warmup(config_->isOpInputsCollectionEnabled()); + } + if (profileStartIter_ >= 0) { + LOG(INFO) << "Tracing starting on iteration = " << profileStartIter_; + } else { + LOG(INFO) << "Tracing starting in " + << duration_cast(profileStartTime_ - now).count() << "s"; + } + + traceBuffers_ = std::make_unique(); + captureWindowStartTime_ = captureWindowEndTime_ = 0; + currentRunloopState_ = RunloopState::Warmup; +} + +void CuptiActivityProfiler::startTraceInternal(const time_point& now) { + captureWindowStartTime_ = libkineto::timeSinceEpoch(now); + VLOG(0) << "Warmup -> CollectTrace"; + for (auto& session: sessions_){ + LOG(INFO) << "Starting child profiler session"; + session->start(); + } + currentRunloopState_ = RunloopState::CollectTrace; +} + +void CuptiActivityProfiler::stopTraceInternal(const time_point& now) { + if (captureWindowEndTime_ == 0) { + captureWindowEndTime_ = libkineto::timeSinceEpoch(now); + } +#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) + if (!cpuOnly_) { + time_point timestamp; + if (VLOG_IS_ON(1)) { + timestamp = system_clock::now(); + } +#ifdef HAS_CUPTI + cupti_.disableCuptiActivities(config_->selectedActivityTypes()); +#else + cupti_.disableActivities(config_->selectedActivityTypes()); +#endif + if (VLOG_IS_ON(1)) { + auto t2 = system_clock::now(); + addOverheadSample( + setupOverhead_, duration_cast(t2 - timestamp).count()); + } + } +#endif // HAS_CUPTI || HAS_ROCTRACER + + if (currentRunloopState_ == RunloopState::CollectTrace) { + VLOG(0) << "CollectTrace -> ProcessTrace"; + } else { + LOG(WARNING) << "Called stopTrace with state == " << + static_cast::type>( + currentRunloopState_.load()); + } + for (auto& session: sessions_){ + LOG(INFO) << "Stopping child profiler session"; + session->stop(); + } + currentRunloopState_ = RunloopState::ProcessTrace; +} + +void CuptiActivityProfiler::resetInternal() { + resetTraceData(); + currentRunloopState_ = RunloopState::WaitForRequest; +} + +bool CuptiActivityProfiler::isWarmupDone( + const time_point& now, + int64_t currentIter) const { + // is it a time based config + if (profileStartIter_ < 0) { + // qualify that this check is not being called from application step() API + // this avoids races between the step() API and periodically invoked + // profiler run loop step() method + return (currentIter < 0) && (now >= profileStartTime_); + } + // this is an iteration based config + if (currentIter < 0) { + return false; + } + return currentIter >= profileStartIter_; +} + +bool CuptiActivityProfiler::isCollectionDone( + const time_point& now, + int64_t currentIter) const { + // is it a time based config + if (profileStartIter_ < 0) { + // qualify that this check is not being called from application step() API + return (currentIter < 0) && (now >= profileEndTime_); + } + // this is an iteration based config + if (currentIter < 0) { + return false; + } + return 
+      currentIter >= profileEndIter_;
+}
+
+const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
+    const time_point<system_clock>& now,
+    const time_point<system_clock>& nextWakeupTime,
+    int64_t currentIter) {
+  auto new_wakeup_time = nextWakeupTime;
+  bool warmup_done = false, collection_done = false;
+
+  VLOG_IF(1, currentIter >= 0) << "Run loop on application step(), iteration = "
+      << currentIter;
+
+  switch (currentRunloopState_) {
+    case RunloopState::WaitForRequest:
+      VLOG(1) << "State: WaitForRequest";
+      // Nothing to do
+      break;
+
+    case RunloopState::Warmup:
+      VLOG(1) << "State: Warmup";
+      warmup_done = isWarmupDone(now, currentIter);
+#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER)
+      // Flushing can take a while so avoid doing it close to the start time
+      if (!cpuOnly_ && currentIter < 0 &&
+          (profileStartIter_ >= 0 || nextWakeupTime < profileStartTime_)) {
+        cupti_.clearActivities();
+      }
+
+      if (cupti_.stopCollection) {
+        // Go to process trace to clear any outstanding buffers etc
+        LOG(WARNING) << "Trace terminated during warmup";
+        std::lock_guard<std::mutex> guard(mutex_);
+        stopTraceInternal(now);
+        resetInternal();
+        VLOG(0) << "Warmup -> WaitForRequest";
+        break;
+      }
+#endif // HAS_CUPTI || HAS_ROCTRACER
+
+      if (warmup_done) {
+        UST_LOGGER_MARK_COMPLETED(kWarmUpStage);
+        if (profileStartIter_ < 0 &&
+            (now > profileStartTime_ + milliseconds(10))) {
+          LOG(WARNING)
+              << "Tracing started "
+              << duration_cast<milliseconds>(now - profileStartTime_).count()
+              << "ms late!";
+        } else {
+          LOG(INFO) << "Tracing started";
+        }
+        startTrace(now);
+        if (libkineto::api().client()) {
+          libkineto::api().client()->start();
+        }
+        if (nextWakeupTime > profileEndTime_) {
+          new_wakeup_time = profileEndTime_;
+        }
+      } else if (nextWakeupTime > profileStartTime_) {
+        new_wakeup_time = profileStartTime_;
+      }
+
+      break;
+
+    case RunloopState::CollectTrace:
+      VLOG(1) << "State: CollectTrace";
+      // captureWindowStartTime_ can be set by external threads,
+      // so recompute end time.
+      // FIXME: Is this a good idea for synced start?
+      if (profileStartIter_ < 0) {
+        std::lock_guard<std::mutex> guard(mutex_);
+        profileEndTime_ = time_point<system_clock>(
+            microseconds(captureWindowStartTime_)) +
+            config_->activitiesDuration();
+      }
+
+      collection_done = isCollectionDone(now, currentIter);
+
+      // TODO: revisit; stopCollection_ is not used right now
+      if (collection_done || stopCollection_.exchange(false)
+#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER)
+          || cupti_.stopCollection
+#endif // HAS_CUPTI || HAS_ROCTRACER
+      ){
+        // Update runloop state first to prevent further updates to shared state
+        LOG(INFO) << "Tracing complete.";
+        if (currentIter > 0) {
+          LOG(INFO) << "This state change was invoked by application's step() call";
+        }
+        // FIXME: Need to communicate reason for stopping on errors
+        if (libkineto::api().client()) {
+          libkineto::api().client()->stop();
+        }
+        std::lock_guard<std::mutex> guard(mutex_);
+        stopTraceInternal(now);
+        VLOG_IF(0, collection_done) << "Reached profile end time";
+
+        UST_LOGGER_MARK_COMPLETED(kCollectionStage);
+      } else if (profileStartIter_ >= 0) {
+        // nothing to do here
+      } else if (now < profileEndTime_ && profileEndTime_ < nextWakeupTime) {
+        new_wakeup_time = profileEndTime_;
+      }
+
+      break;
+
+    case RunloopState::ProcessTrace:
+      VLOG(1) << "State: ProcessTrace";
+      // skip this state transition if it is called from the step() api
+      // of the profiler.
+      // else it could lead to a race between the profiler thread and an
+      // application thread calling step()
+      if (currentIter >= 0) {
+        return new_wakeup_time;
+      }
+      // FIXME: Probably want to allow interruption here
+      // for quickly handling trace request via synchronous API
+      std::lock_guard<std::mutex> guard(mutex_);
+      processTraceInternal(*logger_);
+      UST_LOGGER_MARK_COMPLETED(kPostProcessingStage);
+      resetInternal();
+      VLOG(0) << "ProcessTrace -> WaitForRequest";
+      break;
+  }
+
+  return new_wakeup_time;
+}
+
+void CuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& logger) {
+  LOG(INFO) << "Recorded nets:";
+  {
+    for (const auto& it : iterationCountMap_) {
+      LOG(INFO) << it.first << ": " << it.second << " iterations";
+    }
+    iterationCountMap_.clear();
+  }
+
+  // Process names
+  int32_t pid = processId();
+  string process_name = processName(pid);
+  if (!process_name.empty()) {
+    logger.handleDeviceInfo(
+        {pid, process_name, "CPU"}, captureWindowStartTime_);
+    if (!cpuOnly_) {
+      // GPU events use device id as pid (0-7).
+      constexpr int kMaxGpuCount = 8;
+      for (int gpu = 0; gpu < kMaxGpuCount; gpu++) {
+        logger.handleDeviceInfo(
+            {gpu, process_name, fmt::format("GPU {}", gpu)},
+            captureWindowStartTime_);
+      }
+    }
+  }
+
+  // Thread & stream info
+  for (auto pair : resourceInfo_) {
+    const auto& resource = pair.second;
+    logger.handleResourceInfo(resource, captureWindowStartTime_);
+  }
+
+  for (const auto& iterations : traceSpans_) {
+    for (const auto& span_pair : iterations.second) {
+      const TraceSpan& gpu_span = span_pair.second;
+      if (gpu_span.opCount > 0) {
+        logger.handleTraceSpan(gpu_span);
+      }
+    }
+  }
+
+  // Overhead info
+  overheadInfo_.push_back(ActivityLogger::OverheadInfo("CUPTI Overhead"));
+  for (const auto& info : overheadInfo_) {
+    logger.handleOverheadInfo(info, captureWindowStartTime_);
+  }
+
+  gpuUserEventMap_.logEvents(&logger);
+
+#if !USE_GOOGLE_LOG
+  // Save logs from LoggerCollector objects into Trace metadata.
+  auto LoggerMD = loggerCollectorMetadata_->extractCollectorMetadata();
+  std::unordered_map<std::string, std::vector<std::string>> LoggerMDString;
+  for (auto& md : LoggerMD) {
+    LoggerMDString[toString(md.first)] = md.second;
+  }
+#endif // !USE_GOOGLE_LOG
+
+  logger.finalizeTrace(config, std::move(traceBuffers_), captureWindowEndTime_, LoggerMDString);
+}
+
+void CuptiActivityProfiler::resetTraceData() {
+#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER)
+  if (!cpuOnly_) {
+    cupti_.clearActivities();
+  }
+#endif // HAS_CUPTI || HAS_ROCTRACER
+  activityMap_.clear();
+  cpuCorrelationMap_.clear();
+  correlatedCudaActivities_.clear();
+  gpuUserEventMap_.clear();
+  traceSpans_.clear();
+  clientActivityTraceMap_.clear();
+  traceBuffers_ = nullptr;
+  metadata_.clear();
+  sessions_.clear();
+#if !USE_GOOGLE_LOG
+  Logger::removeLoggerObserver(loggerCollectorMetadata_.get());
+#endif // !USE_GOOGLE_LOG
+}
+
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/CuptiActivityProfiler.h b/tb_plugins/profiling/libkineto/src/CuptiActivityProfiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..208833a4db720429982a63ed72ffa4762ef00bd0
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/CuptiActivityProfiler.h
@@ -0,0 +1,364 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
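+
+// Usage sketch (illustrative only, not part of the original header): the
+// synchronous control API below drives the runloop state machine
+// WaitForRequest -> Warmup -> CollectTrace -> ProcessTrace. Here `config`
+// stands for a populated Config, `logger` for an ActivityLogger
+// implementation, and CuptiActivityApi::singleton() is assumed:
+//
+//   CuptiActivityProfiler profiler(CuptiActivityApi::singleton(), /*cpuOnly*/ false);
+//   profiler.configure(config, system_clock::now());  // -> Warmup
+//   profiler.startTrace(system_clock::now());         // -> CollectTrace
+//   /* ... run the workload being profiled ... */
+//   profiler.stopTrace(system_clock::now());          // -> ProcessTrace
+//   profiler.processTrace(logger);
+//   profiler.reset();                                 // -> WaitForRequest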
+
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <fmt/format.h>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+// TODO(T90238193)
+// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude
+#include "ThreadUtil.h"
+#include "TraceSpan.h"
+#include "libkineto.h"
+#include "output_base.h"
+#include "GenericTraceActivity.h"
+#include "IActivityProfiler.h"
+#include "LoggerCollector.h"
+
+namespace KINETO_NAMESPACE {
+
+class Config;
+class CuptiActivityApi;
+class RoctracerActivityApi;
+
+class CuptiActivityProfiler {
+ public:
+  CuptiActivityProfiler(CuptiActivityApi& cupti, bool cpuOnly);
+  CuptiActivityProfiler(RoctracerActivityApi& rai, bool cpuOnly);
+  CuptiActivityProfiler(const CuptiActivityProfiler&) = delete;
+  CuptiActivityProfiler& operator=(const CuptiActivityProfiler&) = delete;
+
+  bool isActive() const {
+    return currentRunloopState_ != RunloopState::WaitForRequest;
+  }
+
+  // Invoke at a regular interval to perform profiling activities.
+  // When not active, an interval of 1-5 seconds is probably fine,
+  // depending on required warm-up time and delayed start time.
+  // When active, it's a good idea to invoke more frequently to stay below
+  // memory usage limit (ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB) during warmup.
+  const std::chrono::time_point<std::chrono::system_clock> performRunLoopStep(
+      const std::chrono::time_point<std::chrono::system_clock>& now,
+      const std::chrono::time_point<std::chrono::system_clock>& nextWakeupTime,
+      int64_t currentIter = -1);
+
+  // Used for async requests
+  void setLogger(ActivityLogger* logger) {
+    logger_ = logger;
+  }
+
+  // Synchronous control API
+  void startTrace(
+      const std::chrono::time_point<std::chrono::system_clock>& now) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    startTraceInternal(now);
+  }
+
+  void stopTrace(
+      const std::chrono::time_point<std::chrono::system_clock>& now) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    stopTraceInternal(now);
+  }
+
+  // Process CPU and GPU traces
+  void processTrace(ActivityLogger& logger) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    processTraceInternal(logger);
+  }
+
+  void reset() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    resetInternal();
+  }
+
+  // Set up profiler as specified in config.
+  void configure(
+      const Config& config,
+      const std::chrono::time_point<std::chrono::system_clock>& now);
+
+  // Registered with client API to pass CPU trace events over
+  void transferCpuTrace(
+      std::unique_ptr<libkineto::CpuTraceBuffer> cpuTrace);
+
+  Config& config() {
+    return *config_;
+  }
+
+  inline void recordThreadInfo() {
+    int32_t sysTid = systemThreadId();
+    // Note we're using the lower 32 bits of the (opaque) pthread id
+    // as key, because that's what CUPTI records.
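+    // (illustrative example, added: a 64-bit pthread id such as
+    // 0x00007f8a2c3d4e50 is keyed here as the int32_t 0x2c3d4e50)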
+    int32_t tid = threadId();
+    int32_t pid = processId();
+    std::lock_guard<std::mutex> guard(mutex_);
+    recordThreadInfo(sysTid, tid, pid);
+  }
+
+  // T107508020: We can deprecate the recordThreadInfo(void) once we have optimized profiler_kineto
+  void recordThreadInfo(int32_t sysTid, int32_t tid, int32_t pid) {
+    if (resourceInfo_.find({pid, tid}) == resourceInfo_.end()) {
+      resourceInfo_.emplace(
+          std::make_pair(pid, tid),
+          ActivityLogger::ResourceInfo(
+              pid,
+              sysTid,
+              sysTid, // sortindex
+              fmt::format("thread {} ({})", sysTid, getThreadName())));
+    }
+  }
+
+  void addMetadata(const std::string& key, const std::string& value) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    metadata_[key] = value;
+  }
+
+  void addChildActivityProfiler(
+      std::unique_ptr<IActivityProfiler> profiler) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    profilers_.push_back(std::move(profiler));
+  }
+
+ protected:
+
+  using CpuGpuSpanPair = std::pair<TraceSpan, TraceSpan>;
+  static const CpuGpuSpanPair& defaultTraceSpan();
+
+ private:
+
+  // Map of gpu activities to user defined events
+  class GpuUserEventMap {
+   public:
+    // Insert a user defined event which maps to the gpu trace activity.
+    // If the user defined event mapping already exists this will update the
+    // gpu side span to include the span of gpuTraceActivity.
+    void insertOrExtendEvent(const ITraceActivity& cpuTraceActivity,
+      const ITraceActivity& gpuTraceActivity);
+    // Log out the events to the logger
+    void logEvents(ActivityLogger *logger);
+
+    void clear() {
+      streamSpanMap_.clear();
+    }
+
+   private:
+    // device id and stream name
+    using StreamKey = std::pair<int64_t, int64_t>;
+
+    // map of correlation id to TraceSpan
+    using CorrelationSpanMap =
+        std::unordered_map<int64_t, GenericTraceActivity>;
+    std::map<StreamKey, CorrelationSpanMap> streamSpanMap_;
+  };
+
+  GpuUserEventMap gpuUserEventMap_;
+  // id -> activity*
+  std::unordered_map<int64_t, const ITraceActivity*> activityMap_;
+  // cuda runtime id -> pytorch op id
+  // CUPTI provides a mechanism for correlating Cuda events to arbitrary
+  // external events, e.g. operator activities from PyTorch.
+  std::unordered_map<int64_t, int64_t> cpuCorrelationMap_;
+  // CUDA runtime <-> GPU Activity
+  std::unordered_map<int64_t, const ITraceActivity*>
+      correlatedCudaActivities_;
+  std::unordered_map<int64_t, int64_t> userCorrelationMap_;
+
+  // data structure to collect cuptiActivityFlushAll() latency overhead
+  struct profilerOverhead {
+    int64_t overhead;
+    int cntr;
+  };
+
+  bool isWarmupDone(
+      const std::chrono::time_point<std::chrono::system_clock>& now,
+      int64_t currentIter) const;
+
+  bool isCollectionDone(
+      const std::chrono::time_point<std::chrono::system_clock>& now,
+      int64_t currentIter) const;
+
+  void startTraceInternal(
+      const std::chrono::time_point<std::chrono::system_clock>& now);
+
+  void stopTraceInternal(
+      const std::chrono::time_point<std::chrono::system_clock>& now);
+
+  void processTraceInternal(ActivityLogger& logger);
+
+  void resetInternal();
+
+  void finalizeTrace(const Config& config, ActivityLogger& logger);
+
+  void configureChildProfilers();
+
+  // Process a single CPU trace
+  void processCpuTrace(
+      libkineto::CpuTraceBuffer& cpuTrace,
+      ActivityLogger& logger);
+
+  // Create resource names for streams
+  inline void recordStream(int device, int id, const char* postfix) {
+    if (resourceInfo_.find({device, id}) == resourceInfo_.end()) {
+      resourceInfo_.emplace(
+          std::make_pair(device, id),
+          ActivityLogger::ResourceInfo(
+              device, id, id, fmt::format(
+                  "stream {} {}", id, postfix)));
+    }
+  }
+
+  // Record client trace span for subsequent lookups from activities
+  // Also creates a corresponding GPU-side span.
+  CpuGpuSpanPair& recordTraceSpan(TraceSpan& span, int gpuOpCount);
+
+  // Returns true if net name is to be tracked for a specified number of
+  // iterations.
+  bool iterationTargetMatch(libkineto::CpuTraceBuffer& trace);
+
+  // net name to id
+  int netId(const std::string& netName);
+
+  const ITraceActivity* linkedActivity(
+      int32_t correlationId,
+      const std::unordered_map<int64_t, int64_t>& correlationMap);
+
+#ifdef HAS_CUPTI
+  // Process generic CUPTI activity
+  void handleCuptiActivity(const CUpti_Activity* record, ActivityLogger* logger);
+
+  // Process specific GPU activity types
+  void updateGpuNetSpan(const ITraceActivity& gpuOp);
+  bool outOfRange(const ITraceActivity& act);
+  void handleCorrelationActivity(
+      const CUpti_ActivityExternalCorrelation* correlation);
+  void handleRuntimeActivity(
+      const CUpti_ActivityAPI* activity, ActivityLogger* logger);
+  void handleOverheadActivity(
+      const CUpti_ActivityOverhead* activity, ActivityLogger* logger);
+  void handleGpuActivity(const ITraceActivity& act,
+      ActivityLogger* logger);
+  template <class T>
+  void handleGpuActivity(const T* act, ActivityLogger* logger);
+#endif // HAS_CUPTI
+
+  void resetTraceData();
+
+  void addOverheadSample(profilerOverhead& counter, int64_t overhead) {
+    counter.overhead += overhead;
+    counter.cntr++;
+  }
+  int64_t getOverhead(const profilerOverhead& counter) {
+    if (counter.cntr == 0) {
+      return 0;
+    }
+    return counter.overhead / counter.cntr;
+  }
+
+  void checkTimestampOrder(const ITraceActivity* act1);
+
+  // On-demand request configuration
+  std::unique_ptr<Config> config_;
+
+  // Logger used during trace processing
+  ActivityLogger* logger_;
+
+  // Calls to CUPTI are encapsulated behind this interface
+#ifdef HAS_ROCTRACER
+  RoctracerActivityApi& cupti_; // Design failure here
+#else
+  CuptiActivityApi& cupti_;
+#endif
+
+  enum class RunloopState {
+    WaitForRequest,
+    Warmup,
+    CollectTrace,
+    ProcessTrace
+  };
+
+  // Start and end time used for triggering and stopping profiling
+  std::chrono::time_point<std::chrono::system_clock> profileStartTime_;
+  std::chrono::time_point<std::chrono::system_clock> profileEndTime_;
+  int64_t profileStartIter_ = -1, profileEndIter_ = -1;
+
+
+  // All recorded trace spans, both CPU and GPU
+  // Trace Id -> list of iterations.
+  // Using map of lists for the iterator semantics, since we are recording
+  // pointers to the elements in this structure.
+  std::map<std::string, std::list<CpuGpuSpanPair>> traceSpans_;
+
+  // Maintain a map of client trace activity to trace span.
+  // Maps correlation id -> TraceSpan* held by traceSpans_.
+  using ActivityTraceMap = std::unordered_map<int64_t, CpuGpuSpanPair*>;
+  ActivityTraceMap clientActivityTraceMap_;
+
+  // Cache thread names and system thread ids for pthread ids,
+  // and stream ids for GPU streams
+  std::map<
+      std::pair<int64_t, int64_t>,
+      ActivityLogger::ResourceInfo> resourceInfo_;
+
+  std::vector<ActivityLogger::OverheadInfo> overheadInfo_;
+
+  // the overhead to flush the activity buffer
+  profilerOverhead flushOverhead_;
+  // the overhead to enable/disable activity tracking
+  profilerOverhead setupOverhead_;
+
+  bool cpuOnly_{false};
+
+  // ***************************************************************************
+  // Below state is shared with external threads.
+  // These need to either be atomic, accessed under lock or only used
+  // by external threads in separate runloop phases from the profiler thread.
+  // ***************************************************************************
+
+  // Mutex to protect non-atomic access to below state
+  std::mutex mutex_;
+
+  // Runloop phase
+  std::atomic<RunloopState> currentRunloopState_{RunloopState::WaitForRequest};
+
+  // Keep track of the start time of the first net in the current trace.
+  // This is only relevant to Caffe2 as PyTorch does not have nets.
+  // All CUDA events before this time will be removed
+  // Can be written by external threads during collection.
+  int64_t captureWindowStartTime_{0};
+  // Similarly, all CUDA API events after the last net event will be removed
+  int64_t captureWindowEndTime_{0};
+
+  // span name -> iteration count
+  std::map<std::string, int> iterationCountMap_;
+  // Flag used to stop tracing from external api callback.
+  // Needs to be atomic since it's set from a different thread.
+  std::atomic_bool stopCollection_{false};
+
+  // Buffers where trace data is stored
+  std::unique_ptr<ActivityBuffers> traceBuffers_;
+
+  // Trace metadata
+  std::unordered_map<std::string, std::string> metadata_;
+
+  // child activity profilers
+  std::vector<std::unique_ptr<IActivityProfiler>> profilers_;
+
+  // a vector of active profiler plugin sessions
+  std::vector<std::unique_ptr<IActivityProfilerSession>> sessions_;
+
+  // LoggerCollector to collect all LOGs during the trace
+#if !USE_GOOGLE_LOG
+  std::unique_ptr<LoggerCollector> loggerCollectorMetadata_;
+#endif // !USE_GOOGLE_LOG
+};
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/CuptiCallbackApi.cpp b/tb_plugins/profiling/libkineto/src/CuptiCallbackApi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1876003998dc0c66f882d939ca8100750cfd046a
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/CuptiCallbackApi.cpp
@@ -0,0 +1,260 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include "CuptiCallbackApi.h"
+
+#include <assert.h>
+#include <algorithm>
+#include <chrono>
+#include <mutex>
+#include <shared_mutex>
+
+#ifdef HAS_CUPTI
+#include "cupti_call.h"
+#endif
+#include "Logger.h"
+
+
+namespace KINETO_NAMESPACE {
+
+// limit on number of handles per callback type
+constexpr size_t MAX_CB_FNS_PER_CB = 8;
+
+// Reader Writer lock types
+using ReaderWriterLock = std::shared_timed_mutex;
+using ReaderLockGuard = std::shared_lock<ReaderWriterLock>;
+using WriteLockGuard = std::unique_lock<ReaderWriterLock>;
+
+static ReaderWriterLock callbackLock_;
+
+/* Callback Table :
+ *  Overall goal of the design is to optimize the lookup of function
+ *  pointers. The table is structured at two levels and the leaf
+ *  elements in the table are std::list to enable fast access/inserts/deletes
+ *
+ *   <callback domain 0> |
+ *                       -> cb id 0 -> std::list of callbacks
+ *                       ...
+ *                       -> cb id n -> std::list of callbacks
+ *   <callback domain 1> |
+ *                       ...
+ *  CallbackTable is the final table type above
+ *  See type declarations in header file.
+ */
+
+
+/* callback_switchboard : is the global callback handler we register
+ *  with CUPTI. The goal is to make it as efficient as possible
+ *  to re-direct to the registered callback(s).
+ *
+ *  A few things to be careful about:
+ *  a) use if/then switches rather than map/hash structures
+ *  b) avoid dynamic memory allocations
+ *  c) be aware of locking overheads
+ */
+#ifdef HAS_CUPTI
+static void CUPTIAPI callback_switchboard(
+#else
+static void callback_switchboard(
+#endif
+   void* /* unused */,
+   CUpti_CallbackDomain domain,
+   CUpti_CallbackId cbid,
+   const CUpti_CallbackData* cbInfo) {
+
+  // below statement is likely going to take a mutex
+  // on the singleton access
+  CuptiCallbackApi::singleton().__callback_switchboard(
+      domain, cbid, cbInfo);
+}
+
+
+void CuptiCallbackApi::__callback_switchboard(
+   CUpti_CallbackDomain domain,
+   CUpti_CallbackId cbid,
+   const CUpti_CallbackData* cbInfo) {
+  VLOG(0) << "Callback: domain = " << domain << ", cbid = " << cbid;
+  CallbackList *cblist = nullptr;
+
+  switch (domain) {
+
+    // add the fastest path for kernel launch callbacks
+    // as these are the most frequent ones
+    case CUPTI_CB_DOMAIN_RUNTIME_API:
+      switch (cbid) {
+        case CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000:
+          cblist = &callbacks_.runtime[
+              CUDA_LAUNCH_KERNEL - __RUNTIME_CB_DOMAIN_START];
+          break;
+        default:
+          break;
+      }
+      break;
+
+    case CUPTI_CB_DOMAIN_RESOURCE:
+      switch (cbid) {
+        case CUPTI_CBID_RESOURCE_CONTEXT_CREATED:
+          cblist = &callbacks_.resource[
+              RESOURCE_CONTEXT_CREATED - __RESOURCE_CB_DOMAIN_START];
+          break;
+        case CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING:
+          cblist = &callbacks_.resource[
+              RESOURCE_CONTEXT_DESTROYED - __RESOURCE_CB_DOMAIN_START];
+          break;
+        default:
+          break;
+      }
+      break;
+
+    default:
+      return;
+  }
+
+  // ignore callbacks that are not handled
+  if (cblist == nullptr) {
+    return;
+  }
+
+  // make a copy of the callback list so we avoid holding lock
+  // in common case this should be just one func pointer copy
+  std::array<CuptiCallbackFn, MAX_CB_FNS_PER_CB> callbacks;
+  int num_cbs = 0;
+  {
+    ReaderLockGuard rl(callbackLock_);
+    int i = 0;
+    for (auto it = cblist->begin();
+         it != cblist->end() && i < MAX_CB_FNS_PER_CB;
+         it++, i++) {
+      callbacks[i] = *it;
+    }
+    num_cbs = i;
+  }
+
+  for (int i = 0; i < num_cbs; i++) {
+    auto fn = callbacks[i];
+    fn(domain, cbid, cbInfo);
+  }
+}
+
+CuptiCallbackApi& CuptiCallbackApi::singleton() {
+  static CuptiCallbackApi instance;
+  return instance;
+}
+
+CuptiCallbackApi::CuptiCallbackApi() {
+#ifdef HAS_CUPTI
+  lastCuptiStatus_ = CUPTI_ERROR_UNKNOWN;
+  lastCuptiStatus_ = CUPTI_CALL_NOWARN(
+      cuptiSubscribe(&subscriber_,
+          (CUpti_CallbackFunc)callback_switchboard,
+          nullptr));
+
+  initSuccess_ = (lastCuptiStatus_ == CUPTI_SUCCESS);
+#endif
+}
+
+CuptiCallbackApi::CallbackList* CuptiCallbackApi::CallbackTable::lookup(
+    CUpti_CallbackDomain domain, CuptiCallBackID cbid) {
+  size_t idx;
+
+  switch (domain) {
+
+    case CUPTI_CB_DOMAIN_RESOURCE:
+      assert(cbid >= __RESOURCE_CB_DOMAIN_START);
+      assert(cbid < __RESOURCE_CB_DOMAIN_END);
+      idx = cbid - __RESOURCE_CB_DOMAIN_START;
+      return &resource.at(idx);
+
+    case CUPTI_CB_DOMAIN_RUNTIME_API:
+      assert(cbid >= __RUNTIME_CB_DOMAIN_START);
+      assert(cbid < __RUNTIME_CB_DOMAIN_END);
+      idx = cbid - __RUNTIME_CB_DOMAIN_START;
+      return &runtime.at(idx);
+
+    default:
+      LOG(WARNING) << "Unsupported callback domain : " << domain;
+      return nullptr;
+  }
+}
+
+bool CuptiCallbackApi::registerCallback(
+    CUpti_CallbackDomain domain,
+    CuptiCallBackID cbid,
+    CuptiCallbackFn cbfn) {
+  CallbackList* cblist = callbacks_.lookup(domain, cbid);
+
+  if (!cblist) {
+    LOG(WARNING) << "Could not register callback -- domain = " << domain
+                 << " callback id = " << cbid;
+    return false;
+  }
+
+  // avoid duplicates
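+  // (note, added: the duplicate check below runs without holding
+  // callbackLock_; only the push_back at the end takes the write lock)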
+ auto it = std::find(cblist->begin(), cblist->end(), cbfn); + if (it != cblist->end()) { + LOG(WARNING) << "Adding duplicate callback -- domain = " << domain + << " callback id = " << cbid; + return true; + } + + if (cblist->size() == MAX_CB_FNS_PER_CB) { + LOG(WARNING) << "Already registered max callback -- domain = " << domain + << " callback id = " << cbid; + } + + WriteLockGuard wl(callbackLock_); + cblist->push_back(cbfn); + return true; +} + +bool CuptiCallbackApi::deleteCallback( + CUpti_CallbackDomain domain, + CuptiCallBackID cbid, + CuptiCallbackFn cbfn) { + CallbackList* cblist = callbacks_.lookup(domain, cbid); + if (!cblist) { + LOG(WARNING) << "Attempting to remove unsupported callback -- domain = " << domain + << " callback id = " << cbid; + return false; + } + + // Locks are not required here as + // https://en.cppreference.com/w/cpp/container/list/erase + // "References and iterators to the erased elements are invalidated. + // Other references and iterators are not affected." + auto it = std::find(cblist->begin(), cblist->end(), cbfn); + if (it == cblist->end()) { + LOG(WARNING) << "Could not find callback to remove -- domain = " << domain + << " callback id = " << cbid; + return false; + } + + WriteLockGuard wl(callbackLock_); + cblist->erase(it); + return true; +} + +bool CuptiCallbackApi::enableCallback( + CUpti_CallbackDomain domain, CUpti_CallbackId cbid) { +#ifdef HAS_CUPTI + if (initSuccess_) { + lastCuptiStatus_ = CUPTI_CALL_NOWARN( + cuptiEnableCallback(1, subscriber_, domain, cbid)); + return (lastCuptiStatus_ == CUPTI_SUCCESS); + } +#endif + return false; +} + +bool CuptiCallbackApi::disableCallback( + CUpti_CallbackDomain domain, CUpti_CallbackId cbid) { +#ifdef HAS_CUPTI + if (initSuccess_) { + lastCuptiStatus_ = CUPTI_CALL_NOWARN( + cuptiEnableCallback(0, subscriber_, domain, cbid)); + return (lastCuptiStatus_ == CUPTI_SUCCESS); + } +#endif + return false; +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiCallbackApi.h b/tb_plugins/profiling/libkineto/src/CuptiCallbackApi.h new file mode 100644 index 0000000000000000000000000000000000000000..4526f3750b4a134bc888843b8ff347a1f2bf8d5f --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiCallbackApi.h @@ -0,0 +1,130 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include +#ifdef HAS_CUPTI +#include +#endif +#include +#include +#include +#include +#include + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "CuptiCallbackApiMock.h" + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + + +/* CuptiCallbackApi : Provides an abstraction over CUPTI callback + * interface. This enables various callback functions to be registered + * with this class. The class registers a global callback handler that + * redirects to the respective callbacks. + * + * Note: one design choice we made is to only support simple function pointers + * in order to speed up the implementation for fast path. 
+ */ + +using CuptiCallbackFn = void(*)( + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + const CUpti_CallbackData* cbInfo); + + +class CuptiCallbackApi { + + public: + + /* Global list of supported callback ids + * use the class namespace to avoid confusing with CUPTI enums*/ + enum CuptiCallBackID { + CUDA_LAUNCH_KERNEL = 0, + // can possibly support more callback ids per domain + // + __RUNTIME_CB_DOMAIN_START = CUDA_LAUNCH_KERNEL, + + // Callbacks under Resource CB domain + RESOURCE_CONTEXT_CREATED, + RESOURCE_CONTEXT_DESTROYED, + + __RUNTIME_CB_DOMAIN_END = RESOURCE_CONTEXT_CREATED, + __RESOURCE_CB_DOMAIN_START = RESOURCE_CONTEXT_CREATED, + + __RESOURCE_CB_DOMAIN_END = RESOURCE_CONTEXT_DESTROYED + 1, + }; + + + CuptiCallbackApi(const CuptiCallbackApi&) = delete; + CuptiCallbackApi& operator=(const CuptiCallbackApi&) = delete; + + static CuptiCallbackApi& singleton(); + + bool initSuccess() const { + return initSuccess_; + } + +#ifdef HAS_CUPTI + CUptiResult getCuptiStatus() const { + return lastCuptiStatus_; + } +#endif + + bool registerCallback( + CUpti_CallbackDomain domain, + CuptiCallBackID cbid, + CuptiCallbackFn cbfn); + + // returns false if callback was not found + bool deleteCallback( + CUpti_CallbackDomain domain, + CuptiCallBackID cbid, + CuptiCallbackFn cbfn); + + bool enableCallback(CUpti_CallbackDomain domain, CUpti_CallbackId cbid); + bool disableCallback(CUpti_CallbackDomain domain, CUpti_CallbackId cbid); + + + // Please do not use this method. This has to be exposed as public + // so it is accessible from the callback handler + void __callback_switchboard( + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + const CUpti_CallbackData* cbInfo); + + private: + + explicit CuptiCallbackApi(); + + // For callback table design overview see the .cpp file + using CallbackList = std::list; + + // level 2 tables sizes are known at compile time + constexpr static size_t RUNTIME_CB_DOMAIN_SIZE + = (__RUNTIME_CB_DOMAIN_END - __RUNTIME_CB_DOMAIN_START); + + constexpr static size_t RESOURCE_CB_DOMAIN_SIZE + = (__RESOURCE_CB_DOMAIN_END - __RESOURCE_CB_DOMAIN_START); + + // level 1 table is a struct + struct CallbackTable { + std::array runtime; + std::array resource; + + CallbackList* lookup(CUpti_CallbackDomain domain, CuptiCallBackID cbid); + }; + + CallbackTable callbacks_; + bool initSuccess_ = false; + +#ifdef HAS_CUPTI + CUptiResult lastCuptiStatus_; + CUpti_SubscriberHandle subscriber_; +#endif +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiCallbackApiMock.h b/tb_plugins/profiling/libkineto/src/CuptiCallbackApiMock.h new file mode 100644 index 0000000000000000000000000000000000000000..fd51267274f99a0c9949eaac6fdae2dff917c7a0 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiCallbackApiMock.h @@ -0,0 +1,32 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
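+
+// Added note: when HAS_CUPTI is undefined, the shims below mirror just
+// enough of cupti.h for CuptiCallbackApi and CuptiRangeProfilerApi to
+// compile in CPU-only builds; none of these values are passed to CUPTI.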
+ +#pragma once + +// Provides data structures to mock CUPTI Callback API +#ifndef HAS_CUPTI + +enum CUpti_CallbackDomain { + CUPTI_CB_DOMAIN_RESOURCE, + CUPTI_CB_DOMAIN_RUNTIME_API, +}; +enum CUpti_CallbackId { + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, + CUPTI_CBID_RESOURCE_CONTEXT_CREATED, + CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING, +}; + +using CUcontext = void*; + +struct CUpti_ResourceData { + CUcontext context; +}; + +constexpr int CUPTI_API_ENTER = 0; +constexpr int CUPTI_API_EXIT = 0; + +struct CUpti_CallbackData { + CUcontext context; + const char* symbolName; + int callbackSite; +}; +#endif // HAS_CUPTI diff --git a/tb_plugins/profiling/libkineto/src/CuptiEventApi.cpp b/tb_plugins/profiling/libkineto/src/CuptiEventApi.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7f1d48c1d00bb7defb6b622c13da55da99312a3b --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiEventApi.cpp @@ -0,0 +1,112 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "CuptiEventApi.h" + +#include + +#include "Logger.h" +#include "cupti_call.h" + +using namespace std::chrono; +using std::vector; + +namespace KINETO_NAMESPACE { + +CuptiEventApi::CuptiEventApi(CUcontext context) + : context_(context) { + CUPTI_CALL(cuptiGetDeviceId(context_, (uint32_t*)&device_)); +} + +CUpti_EventGroupSets* CuptiEventApi::createGroupSets( + vector& ids) { + CUpti_EventGroupSets* group_sets = nullptr; + CUptiResult res = CUPTI_CALL(cuptiEventGroupSetsCreate( + context_, sizeof(CUpti_EventID) * ids.size(), ids.data(), &group_sets)); + + if (res != CUPTI_SUCCESS || group_sets == nullptr) { + const char* errstr = nullptr; + CUPTI_CALL(cuptiGetResultString(res, &errstr)); + throw std::system_error(EINVAL, std::generic_category(), errstr); + } + + return group_sets; +} + +void CuptiEventApi::destroyGroupSets(CUpti_EventGroupSets* sets) { + CUPTI_CALL(cuptiEventGroupSetsDestroy(sets)); +} + +bool CuptiEventApi::setContinuousMode() { + // Avoid logging noise for CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED + CUptiResult res = CUPTI_CALL_NOWARN(cuptiSetEventCollectionMode( + context_, CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS)); + if (res == CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED) { + return false; + } + // Log warning on other errors + CUPTI_CALL(res); + return (res == CUPTI_SUCCESS); +} + +void CuptiEventApi::enablePerInstance(CUpti_EventGroup eventGroup) { + uint32_t profile_all = 1; + CUPTI_CALL(cuptiEventGroupSetAttribute( + eventGroup, + CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, + sizeof(profile_all), + &profile_all)); +} + +uint32_t CuptiEventApi::instanceCount(CUpti_EventGroup eventGroup) { + uint32_t instance_count = 0; + size_t s = sizeof(instance_count); + CUPTI_CALL(cuptiEventGroupGetAttribute( + eventGroup, CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT, &s, &instance_count)); + return instance_count; +} + +void CuptiEventApi::enableGroupSet(CUpti_EventGroupSet& set) { + CUptiResult res = CUPTI_CALL_NOWARN(cuptiEventGroupSetEnable(&set)); + if (res != CUPTI_SUCCESS) { + const char* errstr = nullptr; + CUPTI_CALL(cuptiGetResultString(res, &errstr)); + throw std::system_error(EIO, std::generic_category(), errstr); + } +} + +void CuptiEventApi::disableGroupSet(CUpti_EventGroupSet& set) { + CUPTI_CALL(cuptiEventGroupSetDisable(&set)); +} + +void CuptiEventApi::readEvent( + CUpti_EventGroup grp, + CUpti_EventID id, + vector& vals) { + size_t s = sizeof(int64_t) * vals.size(); + CUPTI_CALL(cuptiEventGroupReadEvent( + grp, + CUPTI_EVENT_READ_FLAG_NONE, 
+ id, + &s, + reinterpret_cast(vals.data()))); +} + +vector CuptiEventApi::eventsInGroup(CUpti_EventGroup grp) { + uint32_t group_size = 0; + size_t s = sizeof(group_size); + CUPTI_CALL(cuptiEventGroupGetAttribute( + grp, CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS, &s, &group_size)); + size_t events_size = group_size * sizeof(CUpti_EventID); + vector res(group_size); + CUPTI_CALL(cuptiEventGroupGetAttribute( + grp, CUPTI_EVENT_GROUP_ATTR_EVENTS, &events_size, res.data())); + return res; +} + +CUpti_EventID CuptiEventApi::eventId(const std::string& name) { + CUpti_EventID id{0}; + CUPTI_CALL(cuptiEventGetIdFromName(device_, name.c_str(), &id)); + return id; +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiEventApi.h b/tb_plugins/profiling/libkineto/src/CuptiEventApi.h new file mode 100644 index 0000000000000000000000000000000000000000..79610f93f0ecfa62a9508d4caddfa876518169d3 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiEventApi.h @@ -0,0 +1,49 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include +#include +#include + +namespace KINETO_NAMESPACE { + +// C++ interface to CUPTI Events C API. +// Virtual methods are here mainly to allow easier testing. +class CuptiEventApi { + public: + explicit CuptiEventApi(CUcontext context_); + virtual ~CuptiEventApi() {} + + CUdevice device() { + return device_; + } + + virtual CUpti_EventGroupSets* createGroupSets( + std::vector& ids); + virtual void destroyGroupSets(CUpti_EventGroupSets* sets); + + virtual bool setContinuousMode(); + + virtual void enablePerInstance(CUpti_EventGroup eventGroup); + virtual uint32_t instanceCount(CUpti_EventGroup eventGroup); + + virtual void enableGroupSet(CUpti_EventGroupSet& set); + virtual void disableGroupSet(CUpti_EventGroupSet& set); + + virtual void + readEvent(CUpti_EventGroup g, CUpti_EventID id, std::vector& vals); + virtual std::vector eventsInGroup(CUpti_EventGroup g); + + virtual CUpti_EventID eventId(const std::string& name); + + protected: + // Unit testing + CuptiEventApi() : context_(nullptr), device_(0) {} + + private: + CUcontext context_; + CUdevice device_; +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiMetricApi.cpp b/tb_plugins/profiling/libkineto/src/CuptiMetricApi.cpp new file mode 100644 index 0000000000000000000000000000000000000000..36401e7434108d1da079aa4ba0264192c5d62838 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiMetricApi.cpp @@ -0,0 +1,107 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "CuptiMetricApi.h" + +#include + +#include "Logger.h" +#include "cupti_call.h" + +using namespace std::chrono; +using std::vector; + +namespace KINETO_NAMESPACE { + +CUpti_MetricID CuptiMetricApi::idFromName(const std::string& name) { + CUpti_MetricID metric_id{~0u}; + CUptiResult res = + CUPTI_CALL(cuptiMetricGetIdFromName(device_, name.c_str(), &metric_id)); + if (res == CUPTI_ERROR_INVALID_METRIC_NAME) { + LOG(WARNING) << "Invalid metric name: " << name; + } + return metric_id; +} + +// Return a map of event IDs and names for a given metric id. +// Note that many events don't have a name. In that case the name will +// be set to the empty string. 
+std::map CuptiMetricApi::events( + CUpti_MetricID metric_id) { + uint32_t num_events = 0; + CUPTI_CALL(cuptiMetricGetNumEvents(metric_id, &num_events)); + vector ids(num_events); + size_t array_size = num_events * sizeof(CUpti_EventID); + CUPTI_CALL(cuptiMetricEnumEvents(metric_id, &array_size, ids.data())); + std::map res; + for (CUpti_EventID id : ids) { + // Attempt to lookup name from CUPTI + constexpr size_t kMaxEventNameLength = 64; + char cupti_name[kMaxEventNameLength]; + size_t size = kMaxEventNameLength; + CUPTI_CALL( + cuptiEventGetAttribute(id, CUPTI_EVENT_ATTR_NAME, &size, cupti_name)); + cupti_name[kMaxEventNameLength - 1] = 0; + + // CUPTI "helpfully" returns "event_name" when the event is unnamed. + if (size > 0 && strcmp(cupti_name, "event_name") != 0) { + res.emplace(id, cupti_name); + } else { + res.emplace(id, ""); + } + } + return res; +} + +CUpti_MetricValueKind CuptiMetricApi::valueKind(CUpti_MetricID metric) { + CUpti_MetricValueKind res{CUPTI_METRIC_VALUE_KIND_FORCE_INT}; + size_t value_kind_size = sizeof(res); + CUPTI_CALL(cuptiMetricGetAttribute( + metric, CUPTI_METRIC_ATTR_VALUE_KIND, &value_kind_size, &res)); + return res; +} + +CUpti_MetricEvaluationMode CuptiMetricApi::evaluationMode( + CUpti_MetricID metric) { + CUpti_MetricEvaluationMode eval_mode{ + CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE}; + size_t eval_mode_size = sizeof(eval_mode); + CUPTI_CALL(cuptiMetricGetAttribute( + metric, CUPTI_METRIC_ATTR_EVALUATION_MODE, &eval_mode_size, &eval_mode)); + return eval_mode; +} + +// FIXME: Consider caching value kind here +SampleValue CuptiMetricApi::calculate( + CUpti_MetricID metric, + CUpti_MetricValueKind kind, + vector& events, + vector& values, + int64_t duration) { + CUpti_MetricValue metric_value; + CUPTI_CALL(cuptiMetricGetValue( + device_, + metric, + events.size() * sizeof(CUpti_EventID), + events.data(), + values.size() * sizeof(int64_t), + reinterpret_cast(values.data()), + duration, + &metric_value)); + + switch (kind) { + case CUPTI_METRIC_VALUE_KIND_DOUBLE: + case CUPTI_METRIC_VALUE_KIND_PERCENT: + return SampleValue(metric_value.metricValueDouble); + case CUPTI_METRIC_VALUE_KIND_UINT64: + case CUPTI_METRIC_VALUE_KIND_INT64: + case CUPTI_METRIC_VALUE_KIND_THROUGHPUT: + return SampleValue(metric_value.metricValueUint64); + case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL: + return SampleValue((int)metric_value.metricValueUtilizationLevel); + default: + assert(false); + } + return SampleValue(-1); +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiMetricApi.h b/tb_plugins/profiling/libkineto/src/CuptiMetricApi.h new file mode 100644 index 0000000000000000000000000000000000000000..f45d38cd6169dc7fd30208dbb7dac09fd8a9dee5 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiMetricApi.h @@ -0,0 +1,38 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include + +#include +#include + +#include "SampleListener.h" + +namespace KINETO_NAMESPACE { + +// C++ interface to CUPTI Metrics C API. +// Virtual methods are here mainly to allow easier testing. 
+class CuptiMetricApi {
+ public:
+  explicit CuptiMetricApi(CUdevice device) : device_(device) {}
+  virtual ~CuptiMetricApi() {}
+
+  virtual CUpti_MetricID idFromName(const std::string& name);
+  virtual std::map<CUpti_EventID, std::string> events(CUpti_MetricID metric_id);
+
+  virtual CUpti_MetricValueKind valueKind(CUpti_MetricID metric);
+  virtual CUpti_MetricEvaluationMode evaluationMode(CUpti_MetricID metric);
+
+  virtual SampleValue calculate(
+      CUpti_MetricID metric,
+      CUpti_MetricValueKind kind,
+      std::vector<CUpti_EventID>& events,
+      std::vector<int64_t>& values,
+      int64_t duration);
+
+ private:
+  CUdevice device_;
+};
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/CuptiNvPerfMetric.cpp b/tb_plugins/profiling/libkineto/src/CuptiNvPerfMetric.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d1b08ab2c13d0615221e71f43f07c3d3fe102a2f
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/CuptiNvPerfMetric.cpp
@@ -0,0 +1,504 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#ifdef HAS_CUPTI
+#include <cuda_runtime_api.h>
+#if defined(CUDART_VERSION) && CUDART_VERSION > 10000 && CUDART_VERSION < 11040
+#include <cupti_profiler_target.h>
+#include <nvperf_host.h>
+#include <nvperf_target.h>
+#endif // cuda version > 10.00 and < 11.04
+#endif // HAS_CUPTI
+
+// TODO(T90238193)
+// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude
+#include "ScopeExit.h"
+#include "CuptiNvPerfMetric.h"
+#include "Logger.h"
+
+namespace KINETO_NAMESPACE {
+
+// Add a namespace to isolate these utility functions that are only
+// going to be used by the CuptiRangeProfiler. These include calls
+// to NVIDIA PerfWorks APIs.
+namespace nvperf {
+
+
+// Largely based on NVIDIA sample code provided with CUDA release
+// files Metric.cpp and Eval.cpp
+
+// -------------------------------------------------
+// Metric and Counter Data Configuration
+// -------------------------------------------------
+
+
+// Note: Be careful before modifying the code below. There is a specific
+// sequence one needs to follow to program the metrics, else things may
+// stop working. We tried to keep the flow consistent with the example
+// code from NVIDIA. Since most of the programmability comes from
+// the CUPTI profiler metric names this should be okay.
+
+// Only supported on CUDA RT Version between 10.0 and 11.04.
+// After CUDA RT 11.04, the structure has changed.
+// TODO update the structure NVPA_RawMetricsConfig to support 11.04 +#if defined(CUDART_VERSION) && CUDART_VERSION > 10000 && CUDART_VERSION < 11040 + +bool getRawMetricRequests( + NVPA_MetricsContext* metricsContext, + std::vector metricNames, + std::vector& rawMetricsDeps, + std::vector& rawMetricRequests) { + bool isolated = true; + /* Bug in collection with collection of metrics without instances, keep it + * to true*/ + bool keepInstances = true; + + for (const auto& metricName : metricNames) { + + NVPW_MetricsContext_GetMetricProperties_Begin_Params + getMetricPropertiesBeginParams = { + NVPW_MetricsContext_GetMetricProperties_Begin_Params_STRUCT_SIZE, nullptr}; + getMetricPropertiesBeginParams.pMetricsContext = metricsContext; + getMetricPropertiesBeginParams.pMetricName = metricName.c_str(); + + if (!NVPW_CALL( + NVPW_MetricsContext_GetMetricProperties_Begin( + &getMetricPropertiesBeginParams))) { + return false; + } + + for (const char** metricDepsIt = + getMetricPropertiesBeginParams.ppRawMetricDependencies; + *metricDepsIt; + ++metricDepsIt) { + rawMetricsDeps.push_back(*metricDepsIt); + } + + NVPW_MetricsContext_GetMetricProperties_End_Params + getMetricPropertiesEndParams = { + NVPW_MetricsContext_GetMetricProperties_End_Params_STRUCT_SIZE, nullptr}; + getMetricPropertiesEndParams.pMetricsContext = metricsContext; + + if (!NVPW_CALL(NVPW_MetricsContext_GetMetricProperties_End( + &getMetricPropertiesEndParams))) { + return false; + } + } + + for (const auto& rawMetricName : rawMetricsDeps) { + NVPA_RawMetricRequest metricRequest = {NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE, nullptr}; + metricRequest.pMetricName = rawMetricName.c_str(); + metricRequest.isolated = isolated; + metricRequest.keepInstances = keepInstances; + rawMetricRequests.push_back(metricRequest); + VLOG(1) << "Adding raw metric struct : raw metric = " << rawMetricName + << " isolated = " << isolated << " keepinst = " << keepInstances; + } + + if (rawMetricRequests.size() == 0) { + LOG(WARNING) << "CUPTI Profiler was unable to configure any metrics"; + return false; + } + return true; +} + +// Setup CUPTI Profiler Config Image +bool getProfilerConfigImage( + const std::string& chipName, + const std::vector& metricNames, + std::vector& configImage, + const uint8_t* counterAvailabilityImage) { + + NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = { + NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, nullptr}; + metricsContextCreateParams.pChipName = chipName.c_str(); + + if (!NVPW_CALL( + NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { + return false; + } + + NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = { + NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE, nullptr}; + metricsContextDestroyParams.pMetricsContext = + metricsContextCreateParams.pMetricsContext; + + SCOPE_EXIT([&]() { + NVPW_MetricsContext_Destroy( + (NVPW_MetricsContext_Destroy_Params*)&metricsContextDestroyParams); + }); + + // Get all raw metrics required for given metricNames list + std::vector rawMetricRequests; + + // note: we need a variable at this functions scope to hold the string + // pointers for underlying C char arrays. 
+ std::vector rawMetricDeps; + + if (!getRawMetricRequests( + metricsContextCreateParams.pMetricsContext, + metricNames, + rawMetricDeps, + rawMetricRequests)) { + return false; + } + + NVPA_RawMetricsConfigOptions metricsConfigOptions = { + NVPA_RAW_METRICS_CONFIG_OPTIONS_STRUCT_SIZE, nullptr}; + metricsConfigOptions.activityKind = NVPA_ACTIVITY_KIND_PROFILER; + metricsConfigOptions.pChipName = chipName.c_str(); + NVPA_RawMetricsConfig* rawMetricsConfig; + if (!NVPW_CALL( + NVPA_RawMetricsConfig_Create( + &metricsConfigOptions, &rawMetricsConfig))) { + return false; + } + + // TODO check if this is required + if (counterAvailabilityImage) { + NVPW_RawMetricsConfig_SetCounterAvailability_Params + setCounterAvailabilityParams = { + NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE, nullptr}; + setCounterAvailabilityParams.pRawMetricsConfig = rawMetricsConfig; + setCounterAvailabilityParams.pCounterAvailabilityImage = + counterAvailabilityImage; + if (!NVPW_CALL( + NVPW_RawMetricsConfig_SetCounterAvailability( + &setCounterAvailabilityParams))) { + return false; + } + } + + NVPW_RawMetricsConfig_Destroy_Params rawMetricsConfigDestroyParams = { + NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE, nullptr}; + rawMetricsConfigDestroyParams.pRawMetricsConfig = rawMetricsConfig; + SCOPE_EXIT([&]() { + NVPW_RawMetricsConfig_Destroy( + (NVPW_RawMetricsConfig_Destroy_Params*)&rawMetricsConfigDestroyParams); + }); + + // Start a Raw Metric Pass group + NVPW_RawMetricsConfig_BeginPassGroup_Params beginPassGroupParams = { + NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE, nullptr}; + beginPassGroupParams.pRawMetricsConfig = rawMetricsConfig; + if (!NVPW_CALL( + NVPW_RawMetricsConfig_BeginPassGroup(&beginPassGroupParams))) { + return false; + } + + // Add all raw metrics + NVPW_RawMetricsConfig_AddMetrics_Params addMetricsParams = { + NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE, nullptr}; + addMetricsParams.pRawMetricsConfig = rawMetricsConfig; + addMetricsParams.pRawMetricRequests = rawMetricRequests.data(); + addMetricsParams.numMetricRequests = rawMetricRequests.size(); + if (!NVPW_CALL( + NVPW_RawMetricsConfig_AddMetrics(&addMetricsParams))) { + return false; + } + + // End pass group + NVPW_RawMetricsConfig_EndPassGroup_Params endPassGroupParams = { + NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE, nullptr}; + endPassGroupParams.pRawMetricsConfig = rawMetricsConfig; + if (!NVPW_CALL( + NVPW_RawMetricsConfig_EndPassGroup(&endPassGroupParams))) { + return false; + } + + // Setup Config Image generation + NVPW_RawMetricsConfig_GenerateConfigImage_Params generateConfigImageParams = { + NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE, nullptr}; + generateConfigImageParams.pRawMetricsConfig = rawMetricsConfig; + if (!NVPW_CALL( + NVPW_RawMetricsConfig_GenerateConfigImage(&generateConfigImageParams))) { + return false; + } + + // Get the Config Image size... 
nearly there + NVPW_RawMetricsConfig_GetConfigImage_Params getConfigImageParams = { + NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE, nullptr}; + getConfigImageParams.pRawMetricsConfig = rawMetricsConfig; + getConfigImageParams.bytesAllocated = 0; + getConfigImageParams.pBuffer = nullptr; + if (!NVPW_CALL( + NVPW_RawMetricsConfig_GetConfigImage(&getConfigImageParams))) { + return false; + } + + configImage.resize(getConfigImageParams.bytesCopied); + + // Write the Config image binary + getConfigImageParams.bytesAllocated = configImage.size(); + getConfigImageParams.pBuffer = configImage.data(); + if (!NVPW_CALL( + NVPW_RawMetricsConfig_GetConfigImage(&getConfigImageParams))) { + return false; + } + + return true; +} + +bool getCounterDataPrefixImage( + const std::string& chipName, + const std::vector& metricNames, + std::vector& counterDataImagePrefix) { + + NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = { + NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, nullptr}; + metricsContextCreateParams.pChipName = chipName.c_str(); + + if (!NVPW_CALL( + NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { + return false; + } + + NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = { + NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE, nullptr}; + metricsContextDestroyParams.pMetricsContext = + metricsContextCreateParams.pMetricsContext; + + + SCOPE_EXIT([&]() { + NVPW_MetricsContext_Destroy( + (NVPW_MetricsContext_Destroy_Params*)&metricsContextDestroyParams); + }); + + // Get all raw metrics required for given metricNames list + std::vector rawMetricRequests; + + // note: we need a variable at this functions scope to hold the string + // pointers for underlying C char arrays. + std::vector rawMetricDeps; + + if (!getRawMetricRequests( + metricsContextCreateParams.pMetricsContext, + metricNames, + rawMetricDeps, + rawMetricRequests)) { + return false; + } + + // Setup Counter Data builder + NVPW_CounterDataBuilder_Create_Params counterDataBuilderCreateParams = { + NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE, nullptr}; + counterDataBuilderCreateParams.pChipName = chipName.c_str(); + if (!NVPW_CALL( + NVPW_CounterDataBuilder_Create(&counterDataBuilderCreateParams))) { + return false; + } + + NVPW_CounterDataBuilder_Destroy_Params counterDataBuilderDestroyParams = { + NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE, nullptr}; + counterDataBuilderDestroyParams.pCounterDataBuilder = + counterDataBuilderCreateParams.pCounterDataBuilder; + SCOPE_EXIT([&]() { + NVPW_CounterDataBuilder_Destroy(( + NVPW_CounterDataBuilder_Destroy_Params*)&counterDataBuilderDestroyParams); + }); + + // Add metrics to counter data image prefix + NVPW_CounterDataBuilder_AddMetrics_Params addMetricsParams = { + NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE, nullptr}; + addMetricsParams.pCounterDataBuilder = + counterDataBuilderCreateParams.pCounterDataBuilder; + addMetricsParams.pRawMetricRequests = rawMetricRequests.data(); + addMetricsParams.numMetricRequests = rawMetricRequests.size(); + if (!NVPW_CALL( + NVPW_CounterDataBuilder_AddMetrics(&addMetricsParams))) { + return false; + } + + // Get image prefix size + NVPW_CounterDataBuilder_GetCounterDataPrefix_Params + getCounterDataPrefixParams = { + NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE, nullptr}; + getCounterDataPrefixParams.pCounterDataBuilder = + counterDataBuilderCreateParams.pCounterDataBuilder; + getCounterDataPrefixParams.bytesAllocated = 0; + 
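+    // Two-pass query (descriptive note, added): pBuffer stays null for the
+    // first call below, which only reports the required size in bytesCopied;
+    // the buffer is then resized and the call repeated to fill it.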
getCounterDataPrefixParams.pBuffer = nullptr; + if (!NVPW_CALL( + NVPW_CounterDataBuilder_GetCounterDataPrefix( + &getCounterDataPrefixParams))) { + return false; + } + + counterDataImagePrefix.resize(getCounterDataPrefixParams.bytesCopied); + + // Now write counter data image prefix + getCounterDataPrefixParams.bytesAllocated = counterDataImagePrefix.size(); + getCounterDataPrefixParams.pBuffer = counterDataImagePrefix.data(); + if (!NVPW_CALL( + NVPW_CounterDataBuilder_GetCounterDataPrefix( + &getCounterDataPrefixParams))) { + return false; + } + + return true; +} + +// ------------------------------------------------- +// Metric and Counter Evaluation Utilities +// ------------------------------------------------- + +std::string getRangeDescription( + const std::vector& counterDataImage, + int rangeIndex) { + std::vector descriptionPtrs; + + NVPW_Profiler_CounterData_GetRangeDescriptions_Params getRangeDescParams = { + NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE, nullptr}; + getRangeDescParams.pCounterDataImage = counterDataImage.data(); + getRangeDescParams.rangeIndex = rangeIndex; + + if (!NVPW_CALL( + NVPW_Profiler_CounterData_GetRangeDescriptions(&getRangeDescParams))) { + return ""; + } + + descriptionPtrs.resize(getRangeDescParams.numDescriptions); + getRangeDescParams.ppDescriptions = descriptionPtrs.data(); + + if (!NVPW_CALL( + NVPW_Profiler_CounterData_GetRangeDescriptions(&getRangeDescParams))) { + return ""; + } + + std::string rangeName; + + for (size_t i = 0; i < getRangeDescParams.numDescriptions; i++) { + if (i > 0) { + rangeName.append("/"); + } + rangeName.append(descriptionPtrs[i]); + } + return rangeName; +} + +CuptiProfilerResult evalMetricValues( + const std::string& chipName, + const std::vector& counterDataImage, + const std::vector& metricNames, + bool verbose) { + + if (!counterDataImage.size()) { + LOG(ERROR) << "Counter Data Image is empty!"; + return {}; + } + + NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = { + NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE, nullptr}; + metricsContextCreateParams.pChipName = chipName.c_str(); + if (!NVPW_CALL( + NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams))) { + return {}; + } + + NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = { + NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE, nullptr}; + metricsContextDestroyParams.pMetricsContext = + metricsContextCreateParams.pMetricsContext; + SCOPE_EXIT([&]() { + NVPW_MetricsContext_Destroy( + (NVPW_MetricsContext_Destroy_Params*)&metricsContextDestroyParams); + }); + + NVPW_CounterData_GetNumRanges_Params getNumRangesParams = { + NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE, nullptr}; + getNumRangesParams.pCounterDataImage = counterDataImage.data(); + if (!NVPW_CALL( + NVPW_CounterData_GetNumRanges(&getNumRangesParams))) { + return {}; + } + + // TBD in the future support special chars in metric name + // for now these are default + const bool isolated = true; + + // API takes a 2D array of chars + std::vector metricNamePtrs; + + for (const auto& metric : metricNames) { + metricNamePtrs.push_back(metric.c_str()); + } + + CuptiProfilerResult result{ + .metricNames = metricNames}; + + for (size_t rangeIndex = 0; rangeIndex < getNumRangesParams.numRanges; + ++rangeIndex) { + + CuptiRangeMeasurement rangeData { + .rangeName = getRangeDescription(counterDataImage, rangeIndex)}; + rangeData.values.resize(metricNames.size()); + + // First set Counter data image with current range + 
NVPW_MetricsContext_SetCounterData_Params setCounterDataParams = {
+        NVPW_MetricsContext_SetCounterData_Params_STRUCT_SIZE, nullptr};
+
+    setCounterDataParams.pMetricsContext =
+        metricsContextCreateParams.pMetricsContext;
+    setCounterDataParams.pCounterDataImage = counterDataImage.data();
+    setCounterDataParams.isolated = isolated;
+    setCounterDataParams.rangeIndex = rangeIndex;
+
+    NVPW_CALL(NVPW_MetricsContext_SetCounterData(&setCounterDataParams));
+
+
+    // Now we can evaluate GPU metrics
+    NVPW_MetricsContext_EvaluateToGpuValues_Params evalToGpuParams = {
+        NVPW_MetricsContext_EvaluateToGpuValues_Params_STRUCT_SIZE, nullptr};
+    evalToGpuParams.pMetricsContext =
+        metricsContextCreateParams.pMetricsContext;
+    evalToGpuParams.numMetrics = metricNamePtrs.size();
+    evalToGpuParams.ppMetricNames = metricNamePtrs.data();
+    evalToGpuParams.pMetricValues = rangeData.values.data();
+
+    if (!NVPW_CALL(NVPW_MetricsContext_EvaluateToGpuValues(&evalToGpuParams))) {
+      LOG(WARNING) << "Failed to evaluate metrics for range : "
+                   << rangeData.rangeName;
+      continue;
+    }
+
+    if (verbose) {
+      for (size_t i = 0; i < metricNames.size(); i++) {
+        LOG(INFO) << "rangeName: " << rangeData.rangeName
+                  << "\tmetricName: " << metricNames[i]
+                  << "\tgpuValue: " << rangeData.values[i];
+      }
+    }
+
+    result.rangeVals.emplace_back(std::move(rangeData));
+  }
+
+  return result;
+}
+
+#else
+
+bool getProfilerConfigImage(
+    const std::string& /*chipName*/,
+    const std::vector<std::string>& /*metricNames*/,
+    std::vector<uint8_t>& /*configImage*/,
+    const uint8_t* /*counterAvailabilityImage*/) {
+  return false;
+}
+
+bool getCounterDataPrefixImage(
+    const std::string& /*chipName*/,
+    const std::vector<std::string>& /*metricNames*/,
+    std::vector<uint8_t>& /*counterDataImagePrefix*/) {
+  return false;
+}
+
+CuptiProfilerResult evalMetricValues(
+    const std::string& /*chipName*/,
+    const std::vector<uint8_t>& /*counterDataImage*/,
+    const std::vector<std::string>& /*metricNames*/,
+    bool /*verbose*/) {
+  return {};
+}
+
+#endif // cuda version > 10.00 and < 11.04
+
+} // namespace nvperf
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/CuptiNvPerfMetric.h b/tb_plugins/profiling/libkineto/src/CuptiNvPerfMetric.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5dd1b1c1d20b066891f8be679e6d6371d4f4a9b
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/CuptiNvPerfMetric.h
@@ -0,0 +1,71 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
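+
+// Usage sketch (illustrative, not part of the original header; the chip name
+// "GA100" and metric "smsp__warps_launched.avg" are placeholder assumptions):
+//
+//   std::vector<uint8_t> configImage, prefixImage;
+//   std::vector<std::string> metrics{"smsp__warps_launched.avg"};
+//   if (nvperf::getProfilerConfigImage("GA100", metrics, configImage) &&
+//       nvperf::getCounterDataPrefixImage("GA100", metrics, prefixImage)) {
+//     /* hand both images to the CUPTI range profiler session */
+//   }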
+ +#pragma once + +#include +#include +#include + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "Logger.h" + +namespace KINETO_NAMESPACE { + +struct CuptiRangeMeasurement { + std::string rangeName; + std::vector values; +}; + +struct CuptiProfilerResult { + std::vector metricNames; + // rangeName, list values + std::vector rangeVals; +}; + +/* Utilities for CUPTI and NVIDIA PerfWorks Metric API + */ + +#define NVPW_CALL(call) \ + [&]() -> bool { \ + NVPA_Status _status_ = call; \ + if (_status_ != NVPA_STATUS_SUCCESS) { \ + LOG(WARNING) << fmt::format( \ + "function {} failed with error ({})", \ + #call, \ + (int)_status_); \ + return false; \ + } \ + return true; \ + }() + +// fixme - add a results string +// nvpperfGetResultString(_status_, &_errstr_); + +namespace nvperf { + +// Setup CUPTI profiler configuration blob and counter data image prefix +bool getProfilerConfigImage( + const std::string& chipName, + const std::vector& metricNames, + std::vector& configImage, + const uint8_t* counterAvailabilityImage = nullptr); + +// Setup CUPTI profiler configuration blob and counter data image prefix +bool getCounterDataPrefixImage( + const std::string& chipName, + const std::vector& metricNames, + std::vector& counterDataImagePrefix); + +/* NV Perf Metric Evaluation helpers + * - utilities to read binary data and obtain metrics for ranges + */ +CuptiProfilerResult evalMetricValues( + const std::string& chipName, + const std::vector& counterDataImage, + const std::vector& metricNames, + bool verbose = false); + + +} // namespace nvperf +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerApi.cpp b/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerApi.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e5f18ed7b0b70963eb2deab126ff4f7119ed582b --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerApi.cpp @@ -0,0 +1,751 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include +#include +#ifdef HAS_CUPTI +#include +#include +#endif // HAS_CUPTI +#include +#include + +#ifdef HAS_CUPTI +#include "cupti_call.h" +#endif + +#include "time_since_epoch.h" +#include "Logger.h" +#include "Demangle.h" + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "CuptiCallbackApiMock.h" +#include "CuptiRangeProfilerApi.h" + +#if HAS_CUPTI_RANGE_PROFILER +#include +#include +#include "cupti_call.h" +#endif // HAS_CUPTI_RANGE_PROFILER + +namespace KINETO_NAMESPACE { + +#if HAS_CUPTI_RANGE_PROFILER +constexpr char kRootUserRangeName[] = "__profile__"; +constexpr int kCallbacksCountToFlush = 500; + +// Should we set Counter availability image ourselves? 
+
+// Disabled this right now as this call conflicts with DCGM
+// It is not clear why it should conflict except it being a profiler API call
+// TODO Revisit
+constexpr bool kSetCounterAvail = false;
+
+// Shared state to track one Cupti Profiler API per Device
+namespace {
+// per device profiler maps
+std::unordered_map<uint32_t, CuptiRBProfilerSession*> profiler_map;
+std::unordered_map<uint32_t, bool> enable_flag;
+std::unordered_map<uint32_t, bool> disable_flag;
+
+std::mutex contextMutex_;
+std::unordered_map<CUcontext, uint32_t> ctx_to_dev;
+std::set<uint32_t> active_devices;
+}
+
+// forward declarations
+void __trackCudaCtx(CUcontext ctx, uint32_t device_id, CUpti_CallbackId cbid);
+void __trackCudaKernelLaunch(CUcontext ctx, const char* kernelName);
+
+/// Helper functions
+
+// Available raw counters
+std::vector<uint8_t> getCounterAvailiability(CUcontext cuContext) {
+  std::vector<uint8_t> counterAvailabilityImage;
+  CUpti_Profiler_GetCounterAvailability_Params getCounterAvailabilityParams = {
+      CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE, nullptr};
+  getCounterAvailabilityParams.ctx = cuContext;
+  CUPTI_CALL(
+      cuptiProfilerGetCounterAvailability(&getCounterAvailabilityParams));
+
+  counterAvailabilityImage.clear();
+  counterAvailabilityImage.resize(
+      getCounterAvailabilityParams.counterAvailabilityImageSize);
+
+  getCounterAvailabilityParams.pCounterAvailabilityImage =
+      counterAvailabilityImage.data();
+  CUPTI_CALL(
+      cuptiProfilerGetCounterAvailability(&getCounterAvailabilityParams));
+
+  return counterAvailabilityImage;
+}
+
+std::string getChipName(int deviceId) {
+  // Get chip name for the cuda device
+  CUpti_Device_GetChipName_Params getChipNameParams = {
+      CUpti_Device_GetChipName_Params_STRUCT_SIZE, nullptr};
+
+  getChipNameParams.deviceIndex = deviceId;
+  CUPTI_CALL(cuptiDeviceGetChipName(&getChipNameParams));
+
+  return getChipNameParams.pChipName;
+}
+
+inline uint32_t getDevID(CUcontext ctx) {
+  uint32_t device_id = UINT32_MAX;
+  CUPTI_CALL(cuptiGetDeviceId(ctx, &device_id));
+  if (device_id == UINT32_MAX) {
+    LOG(ERROR) << "Could not determine dev id for = " << ctx;
+  }
+  return device_id;
+}
+
+// We use CUPTI Callback functions in three ways :
+// 1. Track cuda contexts and maintain a list of active GPUs to profile
+// 2. Callbacks on kernel launches to track the name of automatic
+//    ranges that correspond to names of kernels
+// 3. Lastly CUPTI profiler has to be enabled on the same thread executing
+//    the CUDA kernels. We use Callbacks to enable the profiler
+//    asynchronously from another thread.
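+//
+// A minimal sketch of the async-enable flow implied by (3), using only
+// names defined in this file (illustrative, not additional API):
+//
+//   // control thread:
+//   profiler->asyncStartAndEnable();   // sets enable_flag[device] and
+//                                      // enables kernel launch callbacks
+//   // thread running CUDA kernels:
+//   //   the next kernel launch fires trackCudaKernelLaunch(), which sees
+//   //   enable_flag[device] and calls profiler->startAndEnable()
+//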
+ +void disableKernelCallbacks(); + +void trackCudaCtx( + CUpti_CallbackDomain /*domain*/, + CUpti_CallbackId cbid, + const CUpti_CallbackData* cbInfo) { + auto *d = reinterpret_cast(cbInfo); + auto ctx = d->context; + uint32_t device_id = getDevID(ctx); + + if (device_id == UINT32_MAX) { + return; + } + + __trackCudaCtx(ctx, device_id, cbid); +} + +void __trackCudaCtx(CUcontext ctx, uint32_t device_id, CUpti_CallbackId cbid) { + std::lock_guard g(contextMutex_); + if (cbid == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) { + VLOG(0) << "CUPTI Profiler observed CUDA Context created = " + << ctx << " device id = " << device_id; + active_devices.insert(device_id); + if constexpr (kSetCounterAvail) { + if (active_devices.size() == 1) { + CuptiRBProfilerSession::setCounterAvailabilityImage( + getCounterAvailiability(ctx)); + } + } + ctx_to_dev[ctx] = device_id; + + } else if (cbid == CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING) { + VLOG(0) << "CUPTI Profiler observed CUDA Context destroyed = " + << ctx << " device id = " << device_id; + auto it = active_devices.find(device_id); + if (it != active_devices.end()) { + active_devices.erase(it); + ctx_to_dev.erase(ctx); + } + } +} + +void trackCudaKernelLaunch( + CUpti_CallbackDomain /*domain*/, + CUpti_CallbackId /*cbid*/, + const CUpti_CallbackData* cbInfo) { + VLOG(1) << " Trace : Callback name = " + << (cbInfo->symbolName ? cbInfo->symbolName: "") + << " context ptr = " << cbInfo->context; + auto ctx = cbInfo->context; + // should be in CUPTI_API_ENTER call site + if (cbInfo->callbackSite != CUPTI_API_ENTER) { + return; + } + __trackCudaKernelLaunch(ctx, cbInfo->symbolName); +} + +void __trackCudaKernelLaunch( + CUcontext ctx, + const char* kernelName) { + VLOG(0) << " Tracking kernel name = " << (kernelName ? kernelName : "") + << " context ptr = " << ctx; + + uint32_t device_id = 0; + auto it = ctx_to_dev.find(ctx); + if (it == ctx_to_dev.end()) { + // Warning here could be too noisy + VLOG(0) << " Could not find corresponding device to ctx = " << ctx; + return; + } else { + device_id = it->second; + } + + auto pit = profiler_map.find(device_id); + if (pit == profiler_map.end() || pit->second == nullptr) { + return; + } + auto profiler = pit->second; + + if (enable_flag[device_id]) { + LOG(INFO) << "Callback handler is enabling cupti profiler"; + profiler->startAndEnable(); + enable_flag[device_id] = false; + + } else if (disable_flag[device_id]) { + LOG(INFO) << "Callback handler is disabling cupti profiler"; + profiler->disableAndStop(); + return; + } + + if (profiler->curRange_ == CUPTI_AutoRange) { + profiler->logKernelName(kernelName ? 
kernelName : "__missing__"); + } + + /* TODO add per kernel time logging + if (measure_per_kernel) { + profiler->kernelStartTs_.push_back( + std::chrono::high_resolution_clock::now()); + } + */ + + // periodically flush profiler data from GPU + if (profiler->numCallbacks_ % kCallbacksCountToFlush == 0) { + profiler->flushCounterData(); + } + profiler->numCallbacks_++; +} + +void enableKernelCallbacks() { + auto& cbapi = CuptiCallbackApi::singleton(); + bool status = cbapi.enableCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000); + if (!status) { + LOG(WARNING) << "CUPTI Range Profiler unable to " + << "enable cuda kernel launch callback"; + return; + } + LOG(INFO) << "CUPTI Profiler kernel callbacks enabled"; +} + +void disableKernelCallbacks() { + auto& cbapi = CuptiCallbackApi::singleton(); + bool status = cbapi.disableCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000); + if (!status) { + LOG(WARNING) << "CUPTI Range Profiler unable to " + << "disable cuda kernel launch callback"; + return; + } + LOG(INFO) << "CUPTI Profiler kernel callbacks disabled"; +} + +// static +std::set CuptiRBProfilerSession::getActiveDevices() { + std::lock_guard g(contextMutex_); + return active_devices; +} + +// static +void CuptiRBProfilerSession::initCupti() { + CUpti_Profiler_Initialize_Params profilerInitializeParams = { + CUpti_Profiler_Initialize_Params_STRUCT_SIZE, nullptr}; + CUPTI_CALL(cuptiProfilerInitialize(&profilerInitializeParams)); +} + +// static +void CuptiRBProfilerSession::deInitCupti() { + CUpti_Profiler_DeInitialize_Params profilerDeInitializeParams = { + CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE, nullptr}; + CUPTI_CALL(cuptiProfilerDeInitialize(&profilerDeInitializeParams)); +} + +// static +void CuptiRBProfilerSession::staticInit() { + CuptiRBProfilerSession::initCupti(); + + // Register CUPTI callbacks + auto& cbapi = CuptiCallbackApi::singleton(); + CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RESOURCE; + bool status = cbapi.registerCallback( + domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED, trackCudaCtx); + status = status && cbapi.registerCallback( + domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, trackCudaCtx); + status = status && cbapi.enableCallback( + domain, CUPTI_CBID_RESOURCE_CONTEXT_CREATED); + status = status && cbapi.enableCallback( + domain, CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING); + + if (!status) { + LOG(WARNING) << "CUPTI Range Profiler unable to attach cuda context " + << "create and destroy callbacks"; + CUPTI_CALL(cbapi.getCuptiStatus()); + return; + } + + domain = CUPTI_CB_DOMAIN_RUNTIME_API; + status = cbapi.registerCallback( + domain, CuptiCallbackApi::CUDA_LAUNCH_KERNEL, trackCudaKernelLaunch); + + if (!status) { + LOG(WARNING) << "CUPTI Range Profiler unable to attach cuda kernel " + << "launch callback"; + return; + } +} + +// static +std::vector& CuptiRBProfilerSession::counterAvailabilityImage() { + static std::vector counterAvailabilityImage_; + return counterAvailabilityImage_; +} + + +// Setup the profiler sessions +CuptiRBProfilerSession::CuptiRBProfilerSession( + const std::vector& metricNames, + int deviceId, + int maxRanges, + int numNestingLevels, + CUcontext cuContext) + : metricNames_(metricNames), + chipName_(getChipName(deviceId)), + deviceId_(deviceId), + maxRanges_(maxRanges), + numNestingLevels_(numNestingLevels), + cuContext_(cuContext) { + CuptiRBProfilerSession::initCupti(); + + LOG(INFO) << "Initializing CUPTI profiler session : device 
= " << deviceId + << " chip = " << chipName_; + /* Generate configuration for metrics, this can also be done offline*/ + NVPW_InitializeHost_Params initializeHostParams = { + NVPW_InitializeHost_Params_STRUCT_SIZE, nullptr}; + NVPW_CALL(NVPW_InitializeHost(&initializeHostParams)); + + if (metricNames.size()) { + if (!nvperf::getProfilerConfigImage( + chipName_, + metricNames, + configImage, + CuptiRBProfilerSession::counterAvailabilityImage().data())) { + LOG(ERROR) << "Failed to create configImage or counterDataImagePrefix"; + return; + } + if (!nvperf::getCounterDataPrefixImage( + chipName_, + metricNames, + counterDataImagePrefix)) { + LOG(ERROR) << "Failed to create counterDataImagePrefix"; + return; + } + } else { + LOG(ERROR) << "No metrics provided to profile"; + return; + } + + if (!createCounterDataImage()) { + LOG(ERROR) << "Failed to create counterDataImage"; + return; + } + + LOG(INFO) << "Size of structs\n" + << " config image size = " << configImage.size() << " B" + << " counter data image prefix = " + << counterDataImagePrefix.size() << " B" + << " counter data image size = " << counterDataImage.size() / 1024 + << " KB" + << " counter sb image size = " + << counterDataScratchBuffer.size() << " B"; + + beginPassParams_ = {CUpti_Profiler_BeginPass_Params_STRUCT_SIZE, nullptr}; + endPassParams_ = {CUpti_Profiler_EndPass_Params_STRUCT_SIZE, nullptr}; + + initSuccess_ = true; + profiler_map[deviceId] = this; +} + +// used in unittests only +CuptiRBProfilerSession::CuptiRBProfilerSession(int deviceId, CUcontext ctx) + : deviceId_(deviceId), cuContext_(ctx) { + initSuccess_ = true; + profiler_map[deviceId] = this; +} + +void CuptiRBProfilerSession::startInternal( + CUpti_ProfilerRange profilerRange, + CUpti_ProfilerReplayMode profilerReplayMode) { + LOG(INFO) << "Starting profiler session: profiler range = " + << ((profilerRange == CUPTI_AutoRange) ? "autorange" : "userrange") + << " replay mode = " + << ((profilerReplayMode == CUPTI_KernelReplay) ? 
"kernel" : "user"); + if (!initSuccess_) { + LOG(WARNING) << __func__ << "() bailing out since initialization failed"; + return; + } + + if (cuContext_ == nullptr) { + for (const auto& it : ctx_to_dev) { + if (it.second == deviceId_) { + cuContext_ = it.first; + break; + } + } + LOG(INFO) << " Cupti Profiler using CUDA context = " << cuContext_; + } + + profilerStartTs_ = std::chrono::high_resolution_clock::now(); + curRange_ = profilerRange; + curReplay_ = profilerReplayMode; + + CUpti_Profiler_BeginSession_Params beginSessionParams = { + CUpti_Profiler_BeginSession_Params_STRUCT_SIZE, nullptr}; + + beginSessionParams.ctx = cuContext_; + beginSessionParams.counterDataImageSize = counterDataImage.size(); + beginSessionParams.pCounterDataImage = counterDataImage.data(); + beginSessionParams.counterDataScratchBufferSize = + counterDataScratchBuffer.size(); + beginSessionParams.pCounterDataScratchBuffer = counterDataScratchBuffer.data(); + beginSessionParams.range = profilerRange; + beginSessionParams.replayMode = profilerReplayMode; + beginSessionParams.maxRangesPerPass = maxRanges_; + beginSessionParams.maxLaunchesPerPass = maxRanges_; + + auto status = CUPTI_CALL(cuptiProfilerBeginSession(&beginSessionParams)); + if (status != CUPTI_SUCCESS) { + LOG(WARNING) << "Failed to start CUPTI profiler"; + initSuccess_ = false; + return; + } + + // Set counter configuration + CUpti_Profiler_SetConfig_Params setConfigParams = { + CUpti_Profiler_SetConfig_Params_STRUCT_SIZE, nullptr}; + + setConfigParams.ctx = cuContext_; + setConfigParams.pConfig = configImage.data(); + setConfigParams.configSize = configImage.size(); + setConfigParams.passIndex = 0; + setConfigParams.minNestingLevel = 1; + setConfigParams.numNestingLevels = numNestingLevels_; + status = CUPTI_CALL(cuptiProfilerSetConfig(&setConfigParams)); + + if (status != CUPTI_SUCCESS) { + LOG(WARNING) << "Failed to configure CUPTI profiler"; + initSuccess_ = false; + return; + } + profilerInitDoneTs_ = std::chrono::high_resolution_clock::now(); + + if (curRange_ == CUPTI_AutoRange) { + enableKernelCallbacks(); + } + profilingActive_ = true; +} + +void CuptiRBProfilerSession::stop() { + if (!initSuccess_) { + LOG(WARNING) << __func__ << "() bailing out since initialization failed"; + return; + } + LOG(INFO) << "Stop profiler session on device = " << deviceId_; + + CUpti_Profiler_UnsetConfig_Params unsetConfigParams = { + CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE, nullptr}; + CUPTI_CALL(cuptiProfilerUnsetConfig(&unsetConfigParams)); + + CUpti_Profiler_EndSession_Params endSessionParams = { + CUpti_Profiler_EndSession_Params_STRUCT_SIZE, nullptr}; + CUPTI_CALL(cuptiProfilerEndSession(&endSessionParams)); + + disableKernelCallbacks(); + + profilerStopTs_ = std::chrono::high_resolution_clock::now(); + profilingActive_ = false; +} + +void CuptiRBProfilerSession::beginPass() { + if (!initSuccess_) { + LOG(WARNING) << __func__ << "() bailing out since initialization failed"; + return; + } + CUPTI_CALL(cuptiProfilerBeginPass(&beginPassParams_)); +} + +bool CuptiRBProfilerSession::endPass() { + if (!initSuccess_) { + LOG(WARNING) << __func__ << "() bailing out since initialization failed"; + return true; + } + CUPTI_CALL(cuptiProfilerEndPass(&endPassParams_)); + return endPassParams_.allPassesSubmitted; +} + +void CuptiRBProfilerSession::flushCounterData() { + LOG(INFO) << "Flushing counter data on device = " << deviceId_; + CUpti_Profiler_FlushCounterData_Params flushCounterDataParams = { + CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE, 
nullptr}; + CUPTI_CALL(cuptiProfilerFlushCounterData(&flushCounterDataParams)); +} + +/// Enable and disable the profiler +void CuptiRBProfilerSession::enable() { + if (!initSuccess_) { + LOG(WARNING) << __func__ << "() bailing out since initialization failed"; + return; + } + CUpti_Profiler_EnableProfiling_Params enableProfilingParams = { + CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE, nullptr}; + CUPTI_CALL(cuptiProfilerEnableProfiling(&enableProfilingParams)); +} + +void CuptiRBProfilerSession::disable() { + if (!initSuccess_) { + LOG(WARNING) << __func__ << "() bailing out since initialization failed"; + return; + } + CUpti_Profiler_DisableProfiling_Params disableProfilingParams = { + CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE, nullptr}; + CUPTI_CALL(cuptiProfilerDisableProfiling(&disableProfilingParams)); +} + +/// User range based profiling +void CuptiRBProfilerSession::pushRange(const std::string& rangeName) { + LOG(INFO) << " CUPTI pushrange ( " << rangeName << " )"; + CUpti_Profiler_PushRange_Params pushRangeParams = { + CUpti_Profiler_PushRange_Params_STRUCT_SIZE, nullptr}; + pushRangeParams.pRangeName = rangeName.c_str(); + CUPTI_CALL(cuptiProfilerPushRange(&pushRangeParams)); +} + +void CuptiRBProfilerSession::popRange() { + LOG(INFO) << " CUPTI pop range"; + CUpti_Profiler_PopRange_Params popRangeParams = { + CUpti_Profiler_PopRange_Params_STRUCT_SIZE, nullptr}; + CUPTI_CALL(cuptiProfilerPopRange(&popRangeParams)); +} + +void CuptiRBProfilerSession::startAndEnable() { + startInternal(curRange_, curReplay_); + if (curReplay_ == CUPTI_UserReplay) { + beginPass(); + } + enable(); + if (curRange_ == CUPTI_UserRange) { + pushRange(kRootUserRangeName); + } + enable_flag[deviceId_] = false; +} + +void CuptiRBProfilerSession::disableAndStop() { + if (curRange_ == CUPTI_UserRange) { + popRange(); + } + disable(); + if (curReplay_ == CUPTI_UserReplay) { + endPass(); + flushCounterData(); + } + stop(); + disable_flag[deviceId_] = false; +} + +void CuptiRBProfilerSession::asyncStartAndEnable( + CUpti_ProfilerRange profilerRange, + CUpti_ProfilerReplayMode profilerReplayMode) { + LOG(INFO) << "Starting CUPTI profiler asynchronously on device = " + << deviceId_ << " profiler range = " + << ((profilerRange == CUPTI_AutoRange) ? "autorange" : "userrange") + << " replay mode = " + << ((profilerReplayMode == CUPTI_KernelReplay) ? 
"kernel" : "user"); + curReplay_ = profilerReplayMode; + curRange_ = profilerRange; + enable_flag[deviceId_] = true; + enableKernelCallbacks(); +} + +void CuptiRBProfilerSession::asyncDisableAndStop() { + LOG(INFO) << "Stopping CUPTI profiler asynchronously on device = " + << deviceId_ << " cu context = " << cuContext_; + disable_flag[deviceId_] = true; +} + + +CuptiProfilerResult CuptiRBProfilerSession::evaluateMetrics( + bool verbose) { + if (!initSuccess_) { + LOG(WARNING) << "Profiling failed, no results to return"; + return {}; + } + if (profilingActive_) { + disableAndStop(); + } + + LOG(INFO) << "Total kernels logged = " << kernelNames_.size(); + if (verbose) { + for (const auto& kernel : kernelNames_) { + std::cout << demangle(kernel) << std::endl; + } + LOG(INFO) << "Profiler Range data : "; + } + + auto results = nvperf::evalMetricValues( + chipName_, counterDataImage, metricNames_, verbose /*verbose*/); + + // profiler end-end duration + auto duration_ms = std::chrono::duration_cast( + profilerStopTs_ - profilerStartTs_); + + auto init_dur_ms = std::chrono::duration_cast( + profilerInitDoneTs_ - profilerStartTs_); + LOG(INFO) << "Total profiler time = " << duration_ms.count() << " ms"; + LOG(INFO) << "Total profiler init time = " << init_dur_ms.count() << " ms"; + + return results; +} + +std::unique_ptr CuptiRBProfilerSession::getProfilerTraceSpan() { + return std::make_unique( + timeSinceEpoch(profilerStartTs_), + timeSinceEpoch(profilerStopTs_), + "__cupti_profiler__" + ); +} + +void CuptiRBProfilerSession::saveCounterData( + const std::string& /*CounterDataFileName*/, + const std::string& /*CounterDataSBFileName*/) { + /* TBD write binary files for counter data and counter scratch buffer */ +} + +/// Setup counter data +bool CuptiRBProfilerSession::createCounterDataImage() { + CUpti_Profiler_CounterDataImageOptions counterDataImageOptions; + counterDataImageOptions.pCounterDataPrefix = counterDataImagePrefix.data(); + counterDataImageOptions.counterDataPrefixSize = counterDataImagePrefix.size(); + counterDataImageOptions.maxNumRanges = maxRanges_; + counterDataImageOptions.maxNumRangeTreeNodes = maxRanges_; + counterDataImageOptions.maxRangeNameLength = 64; + + // Calculate size of counter data image + CUpti_Profiler_CounterDataImage_CalculateSize_Params calculateSizeParams = { + CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE, nullptr}; + calculateSizeParams.pOptions = &counterDataImageOptions; + calculateSizeParams.sizeofCounterDataImageOptions = + CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; + + CUPTI_CALL( + cuptiProfilerCounterDataImageCalculateSize(&calculateSizeParams)); + counterDataImage.resize(calculateSizeParams.counterDataImageSize); + + // Initialize counter data image + CUpti_Profiler_CounterDataImage_Initialize_Params initializeParams = { + CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE, nullptr}; + initializeParams.sizeofCounterDataImageOptions = + CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; + initializeParams.pOptions = &counterDataImageOptions; + initializeParams.counterDataImageSize = + calculateSizeParams.counterDataImageSize; + initializeParams.pCounterDataImage = counterDataImage.data(); + CUPTI_CALL(cuptiProfilerCounterDataImageInitialize(&initializeParams)); + + // Calculate counter Scratch Buffer size + CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params + scratchBufferSizeParams = { + CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE, nullptr}; + + 
scratchBufferSizeParams.counterDataImageSize = + calculateSizeParams.counterDataImageSize; + scratchBufferSizeParams.pCounterDataImage = + initializeParams.pCounterDataImage; + CUPTI_CALL(cuptiProfilerCounterDataImageCalculateScratchBufferSize( + &scratchBufferSizeParams)); + + counterDataScratchBuffer.resize( + scratchBufferSizeParams.counterDataScratchBufferSize); + + // Initialize scratch buffer + CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params + initScratchBufferParams = { + CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE, nullptr}; + + initScratchBufferParams.counterDataImageSize = + calculateSizeParams.counterDataImageSize; + + initScratchBufferParams.pCounterDataImage = + initializeParams.pCounterDataImage; + initScratchBufferParams.counterDataScratchBufferSize = + scratchBufferSizeParams.counterDataScratchBufferSize; + initScratchBufferParams.pCounterDataScratchBuffer = + counterDataScratchBuffer.data(); + + CUPTI_CALL(cuptiProfilerCounterDataImageInitializeScratchBuffer( + &initScratchBufferParams)); + + return true; +} + +#elif defined(HAS_CUPTI) + +// Create empty stubs for the API when CUPTI is not present. +CuptiRBProfilerSession::CuptiRBProfilerSession( + const std::vector& metricNames, + int deviceId, + int maxRanges, + int numNestingLevels, + CUcontext cuContext) + : metricNames_(metricNames), + deviceId_(deviceId), + maxRanges_(maxRanges), + numNestingLevels_(numNestingLevels), + cuContext_(cuContext) {} +void CuptiRBProfilerSession::stop() {} +void CuptiRBProfilerSession::enable() {} +void CuptiRBProfilerSession::disable() {} +void CuptiRBProfilerSession::beginPass() {} +bool CuptiRBProfilerSession::endPass() { return true; } +void CuptiRBProfilerSession::flushCounterData() {} +void CuptiRBProfilerSession::pushRange(const std::string& /*rangeName*/) {} +void CuptiRBProfilerSession::popRange() {} +void CuptiRBProfilerSession::asyncStartAndEnable( + CUpti_ProfilerRange /*profilerRange*/, + CUpti_ProfilerReplayMode /*profilerReplayMode*/) {} +void CuptiRBProfilerSession::asyncDisableAndStop() {} +CuptiProfilerResult CuptiRBProfilerSession::evaluateMetrics(bool verbose) { + static CuptiProfilerResult res; + return res; +}; +void CuptiRBProfilerSession::saveCounterData( + const std::string& /*CounterDataFileName*/, + const std::string& /*CounterDataSBFileName*/) {} +void CuptiRBProfilerSession::initCupti() {} +void CuptiRBProfilerSession::deInitCupti() {} +void CuptiRBProfilerSession::staticInit() {} +bool CuptiRBProfilerSession::createCounterDataImage() { return true; } +void CuptiRBProfilerSession::startInternal( + CUpti_ProfilerRange /*profilerRange*/, + CUpti_ProfilerReplayMode /*profilerReplayMode*/) {} +std::vector& CuptiRBProfilerSession::counterAvailabilityImage() { + static std::vector _vec; + return _vec; +} +#endif // HAS_CUPTI_RANGE_PROFILER + +namespace testing { + +void trackCudaCtx(CUcontext ctx, uint32_t device_id, CUpti_CallbackId cbid) { +#if HAS_CUPTI_RANGE_PROFILER + __trackCudaCtx(ctx, device_id, cbid); +#endif // HAS_CUPTI_RANGE_PROFILER +} + +void trackCudaKernelLaunch(CUcontext ctx, const char* kernelName) { +#if HAS_CUPTI_RANGE_PROFILER + __trackCudaKernelLaunch(ctx, kernelName); +#endif // HAS_CUPTI_RANGE_PROFILER +} + +} // namespace testing +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerApi.h b/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerApi.h new file mode 100644 index 
0000000000000000000000000000000000000000..98a0b3ea5f4850dfa060e4e86d5ebf210692db1a --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerApi.h @@ -0,0 +1,220 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#ifdef HAS_CUPTI +#include +#include +// Using CUDA 11 and above due to usage of API: cuptiProfilerGetCounterAvailability. +#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000 && CUDART_VERSION < 11040 && CUDA_VERSION >= 11000 +#define HAS_CUPTI_RANGE_PROFILER 1 +#endif // CUDART_VERSION > 10.00 and < 11.04 && CUDA_VERSION >= 11.00 +#endif // HAS_CUPTI + +#if HAS_CUPTI_RANGE_PROFILER +#include +#include +#include +#else +using CUpti_ProfilerRange = enum +{ + CUPTI_AutoRange, + CUPTI_UserRange, +}; + +using CUpti_ProfilerReplayMode = enum +{ + CUPTI_KernelReplay, + CUPTI_UserReplay, +}; +#endif // HAS_CUPTI_RANGE_PROFILER + +#include +#include +#include +#include +#include + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "TraceSpan.h" +#include "CuptiCallbackApi.h" +#include "CuptiNvPerfMetric.h" + +/* Cupti Range based profiler session + * See : https://docs.nvidia.com/cupti/Cupti/r_main.html#r_profiler + */ + +namespace KINETO_NAMESPACE { + +class CuptiRBProfilerSession { + public: + // Initialize and configure CUPTI Profiler counters. + // - Metric names must be provided as string vector. + // - Supported values by CUPTI can be found at - + // https://docs.nvidia.com/cupti/Cupti/r_main.html#r_host_metrics_api + explicit CuptiRBProfilerSession( + const std::vector& metricNames, + int deviceId, + int maxRanges, + int numNestingLevels = 1, + CUcontext cuContext = 0); + + virtual ~CuptiRBProfilerSession() = default; + + // Start profiling session + // This function has to be called from the CPU thread running + // the CUDA context. If this is not the case asyncStartAndEnable() + // can be used + void start( + CUpti_ProfilerRange profilerRange = CUPTI_AutoRange, + CUpti_ProfilerReplayMode profilerReplayMode = CUPTI_KernelReplay) { + startInternal(profilerRange, profilerReplayMode); + } + + // Stop profiling session + virtual void stop(); + + virtual void enable(); + virtual void disable(); + + // Profiler passes + // GPU hardware has limited performance monitoring resources + // the CUPTI profiler may need to run multiple passes to collect + // data for a given range + // If we use kernel replay model the kernels are automatically replayed + // else, you can use the beginPass() and endPass() functions below + // for user to manage the replays + + // starts a profiler pass with given kernels in between + virtual void beginPass(); + + // end a profiler pass with given kernels in between + // returns true if no more passes are required + virtual bool endPass(); + + // flushes the counter data - required if you use user replay + virtual void flushCounterData(); + + // Each pass can contain multiple of ranges + // metrics configured in a pass are collected per each range-stack. 
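+  //
+  // Illustrative user-replay flow (hypothetical caller code), mirroring
+  // the ordering used by startAndEnable() / disableAndStop():
+  //
+  //   session.start(CUPTI_UserRange, CUPTI_UserReplay);
+  //   do {
+  //     session.beginPass();
+  //     session.enable();
+  //     session.pushRange("my_region");   // "my_region" is a made-up name
+  //     /* ... launch CUDA kernels ... */
+  //     session.popRange();
+  //     session.disable();
+  //   } while (!session.endPass());
+  //   session.flushCounterData();
+  //   auto result = session.evaluateMetrics();
+  //   session.stop();
+  //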
+  virtual void pushRange(const std::string& rangeName);
+  virtual void popRange();
+
+  // utilities for common operations
+  void startAndEnable();
+  void disableAndStop();
+
+  // Async APIs : these can be called from another thread
+  // outside the CUDA context being profiled
+  void asyncStartAndEnable(
+      CUpti_ProfilerRange profilerRange = CUPTI_AutoRange,
+      CUpti_ProfilerReplayMode profilerReplayMode = CUPTI_KernelReplay);
+  void asyncDisableAndStop();
+
+  void printMetrics() {
+    evaluateMetrics(true);
+  }
+
+  std::unique_ptr<TraceSpan> getProfilerTraceSpan();
+
+  virtual CuptiProfilerResult evaluateMetrics(bool verbose = false);
+
+  void saveCounterData(
+      const std::string& CounterDataFileName,
+      const std::string& CounterDataSBFileName);
+
+  // This is not thread safe so please only call after
+  // profiling has stopped
+  const std::vector<std::string>& getKernelNames() const {
+    return kernelNames_;
+  }
+
+  int deviceId() const {
+    return deviceId_;
+  }
+
+  bool profilingActive() const {
+    return profilingActive_;
+  }
+
+  static std::set<uint32_t> getActiveDevices();
+
+  static void initCupti();
+
+  static void deInitCupti();
+
+  static void staticInit();
+
+  static void setCounterAvailabilityImage(std::vector<uint8_t> img) {
+    counterAvailabilityImage() = img;
+  }
+ protected:
+  CuptiRBProfilerSession(int deviceId, CUcontext ctx);
+
+  virtual void startInternal(
+      CUpti_ProfilerRange profilerRange,
+      CUpti_ProfilerReplayMode profilerReplayMode);
+
+  CUpti_ProfilerRange curRange_ = CUPTI_AutoRange;
+  CUpti_ProfilerReplayMode curReplay_ = CUPTI_KernelReplay;
+
+ private:
+
+  bool createCounterDataImage();
+
+
+  // log kernel names used by the callbacks
+  void logKernelName(const char* kernel) {
+    std::lock_guard<std::mutex> lg(kernelNamesMutex_);
+    kernelNames_.emplace_back(kernel);
+  }
+
+  std::vector<std::string> metricNames_;
+  std::string chipName_;
+
+  uint32_t deviceId_ = 0;
+  int maxRanges_;
+  int numNestingLevels_;
+  CUcontext cuContext_;
+
+
+  // data buffers for configuration and counter data collection
+  std::vector<uint8_t> counterDataImagePrefix;
+  std::vector<uint8_t> configImage;
+  std::vector<uint8_t> counterDataImage;
+  std::vector<uint8_t> counterDataScratchBuffer;
+
+  std::chrono::time_point<std::chrono::high_resolution_clock> profilerStartTs_;
+  std::chrono::time_point<std::chrono::high_resolution_clock>
+      profilerInitDoneTs_;
+  std::chrono::time_point<std::chrono::high_resolution_clock> profilerStopTs_;
+
+  std::mutex kernelNamesMutex_;
+  // raw kernel names (not demangled)
+  std::vector<std::string> kernelNames_;
+
+  uint32_t numCallbacks_ = 0;
+
+  static std::vector<uint8_t>& counterAvailabilityImage();
+
+#if HAS_CUPTI_RANGE_PROFILER
+  CUpti_Profiler_BeginPass_Params beginPassParams_;
+  CUpti_Profiler_EndPass_Params endPassParams_;
+#endif
+
+  bool initSuccess_ = false;
+  bool profilingActive_ = false;
+
+  friend void __trackCudaKernelLaunch(CUcontext ctx, const char* kernelName);
+};
+
+// called directly only in unit tests
+namespace testing {
+
+void trackCudaCtx(CUcontext ctx, uint32_t device_id, CUpti_CallbackId cbid);
+void trackCudaKernelLaunch(CUcontext ctx, const char* kernelName);
+
+} // namespace testing
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerConfig.cpp b/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerConfig.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..04b1ad0cb3f807cf87d32bc03de0ca9b552b0063
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerConfig.cpp
@@ -0,0 +1,68 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
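+
+// For reference, a (hypothetical) config fragment exercising the three
+// option keys handled below; the metric names are placeholders only:
+//
+//   CUPTI_PROFILER_METRICS = dram__bytes_read.sum, smsp__warps_launched.avg
+//   CUPTI_PROFILER_ENABLE_PER_KERNEL = true
+//   CUPTI_PROFILER_MAX_RANGES = 100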
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+using namespace std::chrono;
+
+namespace KINETO_NAMESPACE {
+
+// number of ranges affects the size of the counter data binary used by
+// the CUPTI Profiler. these defaults can be tuned
+constexpr int KMaxAutoRanges = 1500; // supports 1500 kernels
+constexpr int KMaxUserRanges = 10; // enable up to 10 sub regions marked by user
+
+constexpr char kCuptiProfilerMetricsKey[] = "CUPTI_PROFILER_METRICS";
+constexpr char kCuptiProfilerPerKernelKey[] = "CUPTI_PROFILER_ENABLE_PER_KERNEL";
+constexpr char kCuptiProfilerMaxRangesKey[] = "CUPTI_PROFILER_MAX_RANGES";
+
+CuptiRangeProfilerConfig::CuptiRangeProfilerConfig(Config& cfg)
+    : parent_(&cfg),
+      cuptiProfilerPerKernel_(false),
+      cuptiProfilerMaxRanges_(0) {}
+
+bool CuptiRangeProfilerConfig::handleOption(const std::string& name, std::string& val) {
+  VLOG(0) << " handling : " << name << " = " << val;
+  // Cupti Range based Profiler configuration
+  if (!name.compare(kCuptiProfilerMetricsKey)) {
+    activitiesCuptiMetrics_ = splitAndTrim(val, ',');
+  } else if (!name.compare(kCuptiProfilerPerKernelKey)) {
+    cuptiProfilerPerKernel_ = toBool(val);
+  } else if (!name.compare(kCuptiProfilerMaxRangesKey)) {
+    cuptiProfilerMaxRanges_ = toInt64(val);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+void CuptiRangeProfilerConfig::setDefaults() {
+  if (activitiesCuptiMetrics_.size() > 0 && cuptiProfilerMaxRanges_ == 0) {
+    cuptiProfilerMaxRanges_ =
+        cuptiProfilerPerKernel_ ? KMaxAutoRanges : KMaxUserRanges;
+  }
+}
+
+void CuptiRangeProfilerConfig::printActivityProfilerConfig(std::ostream& s) const {
+  if (activitiesCuptiMetrics_.size() > 0) {
+    s << "Cupti Profiler metrics : "
+      << fmt::format("{}", fmt::join(activitiesCuptiMetrics_, ", ")) << std::endl;
+    s << "Cupti Profiler measure per kernel : "
+      << cuptiProfilerPerKernel_ << std::endl;
+    s << "Cupti Profiler max ranges : " << cuptiProfilerMaxRanges_ << std::endl;
+  }
+}
+
+void CuptiRangeProfilerConfig::registerFactory() {
+  Config::addConfigFactory(
+      kCuptiProfilerConfigName,
+      [](Config& cfg) { return new CuptiRangeProfilerConfig(cfg); });
+}
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerConfig.h b/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerConfig.h
new file mode 100644
index 0000000000000000000000000000000000000000..549b8a4e8b40c66b59bae974eb87c7f64967344e
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/CuptiRangeProfilerConfig.h
@@ -0,0 +1,86 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
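+
+// Sketch of the intended read path once a Config has been parsed
+// (illustrative only; see the accessors declared below):
+//
+//   const auto& rb_cfg = CuptiRangeProfilerConfig::get(cfg);
+//   if (!rb_cfg.activitiesCuptiMetrics().empty()) {
+//     int64_t max_ranges = rb_cfg.cuptiProfilerMaxRanges();
+//     // ... configure a CuptiRBProfilerSession accordingly ...
+//   }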
+
+#pragma once
+
+#include "Config.h"
+
+#include
+#include
+#include
+#include
+
+namespace KINETO_NAMESPACE {
+
+constexpr char kCuptiProfilerConfigName[] = "cupti_rb_profiler";
+
+class CuptiRangeProfilerConfig : public AbstractConfig {
+ public:
+  bool handleOption(const std::string& name, std::string& val) override;
+
+  void validate(
+      const std::chrono::time_point<std::chrono::system_clock>&
+          fallbackProfileStartTime) override {}
+
+  static CuptiRangeProfilerConfig& get(const Config& cfg) {
+    return dynamic_cast<CuptiRangeProfilerConfig&>(cfg.feature(
+        kCuptiProfilerConfigName));
+  }
+
+  Config& parent() const {
+    return *parent_;
+  }
+
+  std::vector<std::string> activitiesCuptiMetrics() const {
+    return activitiesCuptiMetrics_;
+  }
+
+  bool cuptiProfilerPerKernel() const {
+    return cuptiProfilerPerKernel_;
+  }
+
+  int64_t cuptiProfilerMaxRanges() const {
+    return cuptiProfilerMaxRanges_;
+  }
+
+  void setSignalDefaults() override {
+    setDefaults();
+  }
+
+  void setClientDefaults() override {
+    setDefaults();
+  }
+
+  void printActivityProfilerConfig(std::ostream& s) const override;
+
+  static void registerFactory();
+ protected:
+  AbstractConfig* cloneDerived(AbstractConfig& parent) const override {
+    CuptiRangeProfilerConfig* clone = new CuptiRangeProfilerConfig(*this);
+    clone->parent_ = dynamic_cast<Config*>(&parent);
+    return clone;
+  }
+
+ private:
+  CuptiRangeProfilerConfig() = delete;
+  explicit CuptiRangeProfilerConfig(Config& parent);
+  explicit CuptiRangeProfilerConfig(
+      const CuptiRangeProfilerConfig& other) = default;
+
+  // some defaults will depend on other configuration
+  void setDefaults();
+
+  // Associated Config object
+  Config* parent_;
+
+  // Counter metrics exposed via CUPTI Profiler API
+  std::vector<std::string> activitiesCuptiMetrics_;
+
+  // Collect profiler metrics per kernel - autorange mode
+  bool cuptiProfilerPerKernel_{false};
+
+  // max number of ranges to configure the profiler for.
+  // this has to be set beforehand to reserve space for the output
+  int64_t cuptiProfilerMaxRanges_ = 0;
+};
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/DaemonConfigLoader.h b/tb_plugins/profiling/libkineto/src/DaemonConfigLoader.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b0ed92863648824a57ce8193ddc16d7cf23622e
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/DaemonConfigLoader.h
@@ -0,0 +1,27 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include
+#include
+
+namespace KINETO_NAMESPACE {
+
+class DaemonConfigLoader {
+ public:
+  virtual ~DaemonConfigLoader() {}
+
+  // Return the base config from the daemon
+  virtual std::string readBaseConfig() = 0;
+
+  // Return a configuration string from the daemon, if one has been posted.
+  virtual std::string readOnDemandConfig(bool events, bool activities) = 0;
+
+  // Returns the number of tracked contexts for this device. The daemon has a
+  // global view. If an unexpected error occurs, return -1.
+  virtual int gpuContextCount(uint32_t device) = 0;
+
+  virtual void setCommunicationFabric(bool enabled) = 0;
+};
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/Demangle.cpp b/tb_plugins/profiling/libkineto/src/Demangle.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f84f0b8ec36f621061cb1e8bb8dd948cb8aed7b3
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/Demangle.cpp
@@ -0,0 +1,49 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
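+
+// Example: demangle("_Z3fooi") returns "foo(int)" under the Itanium C++ ABI;
+// strings that fail to demangle, or that exceed kMaxSymbolSize, are
+// returned unchanged.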
+ +#include "Demangle.h" + +#ifndef _MSC_VER +#include +#endif +#include +#include + +namespace KINETO_NAMESPACE { + +static constexpr int kMaxSymbolSize = 1024; + +std::string demangle(const char* name) { +#ifndef _MSC_VER + if (!name) { + return ""; + } + + if (strlen(name) > kMaxSymbolSize) { + return name; + } + + int status; + size_t len = 0; + char* demangled = abi::__cxa_demangle(name, nullptr, &len, &status); + if (status != 0) { + return name; + } + std::string res(demangled); + // The returned buffer must be freed! + free(demangled); + return res; +#else + // TODO: demangling on Windows + if (!name) { + return ""; + } else { + return name; + } +#endif +} + +std::string demangle(const std::string& name) { + return demangle(name.c_str()); +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/Demangle.h b/tb_plugins/profiling/libkineto/src/Demangle.h new file mode 100644 index 0000000000000000000000000000000000000000..6dcf0776f1abf30e7e3614272fa02f6bae1bdf35 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/Demangle.h @@ -0,0 +1,12 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include + +namespace KINETO_NAMESPACE { + +std::string demangle(const char* name); +std::string demangle(const std::string& name); + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/EventProfiler.cpp b/tb_plugins/profiling/libkineto/src/EventProfiler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dbf2755238974392ff6205f05a5c80a1733bf2ee --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/EventProfiler.cpp @@ -0,0 +1,635 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "EventProfiler.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "CuptiEventApi.h" +#include "Logger.h" + +using namespace std::chrono; +using std::accumulate; +using std::endl; +using std::map; +using std::ostream; +using std::string; +using std::unique_ptr; +using std::vector; + +namespace KINETO_NAMESPACE { + +static std::mutex& logMutex() { + static std::mutex instance; + return instance; +} + +// --------------------------------------------------------------------- +// class Event +// --------------------------------------------------------------------- + +// Compute domain instance percentiles +PercentileList& Event::percentiles( + PercentileList& pcs, + const SampleSlice& slice) const { + vector instance_values; + instance_values.reserve(instanceCount); + for (int i = 0; i < instanceCount; i++) { + instance_values.push_back(sumInstance(i, slice)); + } + return KINETO_NAMESPACE::percentiles(instance_values, pcs); +} + +// Add up all samples for a given domain instance +int64_t Event::sumInstance(int i, const SampleSlice& slice) const { + auto r = toIdxRange(slice); + auto start = samples_.cbegin(); + std::advance(start, r.first); + auto end = start; + std::advance(end, r.second); + return accumulate(start, end, 0ul, [i](int64_t a, const Sample& b) { + return a + b.second[i]; + }); +} + +// Add up all samples across all domain instances +int64_t Event::sumAll(const SampleSlice& slice) const { + int64_t res = 0; + for (int i = 0; i < instanceCount; i++) { + res += sumInstance(i, slice); + } + return res; +} + +// Print raw sample values for all domains +void Event::printSamples(ostream& s, CUdevice device) const { + // Don't mess up output with interleaved 
lines + // Probably OK to reuse logMutex() here since this is + // used for debugging, but need to keep an eye on it. + std::lock_guard lock(logMutex()); + s << "Device " << device << " " << name << ":" << endl; + for (const auto& sample : samples_) { + const auto& vals = sample.second; + for (int64_t val : vals) { + s << val << " "; + } + s << endl; + } +} + +// --------------------------------------------------------------------- +// class Metric +// --------------------------------------------------------------------- +Metric::Metric( + string name, + CUpti_MetricID id, + vector events, + CUpti_MetricEvaluationMode eval_mode, + CuptiMetricApi& cupti_metrics) + : name(std::move(name)), + id_(id), + events_(std::move(events)), + evalMode_(eval_mode), + cuptiMetrics_(cupti_metrics), + valueKind_(cuptiMetrics_.valueKind(id)) {} + +// Return per-SM vector as well as total +struct Metric::CalculatedValues Metric::calculate( + map& event_map, + nanoseconds sample_duration, + const SampleSlice& slice) { + vector metric_values; + vector ev_values; + ev_values.reserve(events_.size()); + if (evalMode_ & CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE) { + int instance_count = instanceCount(event_map); + metric_values.reserve(instance_count); + for (int i = 0; i < instance_count; i++) { + ev_values.clear(); + for (CUpti_EventID event_id : events_) { + ev_values.push_back(event_map[event_id].sumInstance(i, slice)); + } + metric_values.push_back(cuptiMetrics_.calculate( + id_, valueKind_, events_, ev_values, sample_duration.count())); + } + } + + // FIXME: Check assumption that all instances are profiled + ev_values.clear(); + for (CUpti_EventID event_id : events_) { + ev_values.push_back(event_map[event_id].sumAll(slice)); + } + SampleValue total = cuptiMetrics_.calculate( + id_, valueKind_, events_, ev_values, sample_duration.count()); + if (evalMode_ & CUPTI_METRIC_EVALUATION_MODE_AGGREGATE) { + metric_values.push_back(total); + } + return {metric_values, std::move(total)}; +} + +void Metric::printDescription(ostream& s) const { + s << fmt::format("{} ({})", name, fmt::join(events_, ",")) << endl; +} + +// --------------------------------------------------------------------- +// class EventGroupSet +// --------------------------------------------------------------------- + +// Each domain has a set of counters. +// Some counters in a domain can be collected simultaneously in a "group" +// Counters from different domains can also be collected at the same time +// Therefore we have a "set of groups", or group set, with counters that +// can all be collected at once. +EventGroupSet::EventGroupSet( + CUpti_EventGroupSet& set, + map& events, + CuptiEventApi& cupti) + : set_(set), events_(events), cuptiEvents_(cupti), enabled_(false) { + for (int g = 0; g < set.numEventGroups; g++) { + CUpti_EventGroup grp = set.eventGroups[g]; + // Profile all domain instances + cuptiEvents_.enablePerInstance(grp); + uint32_t instance_count = cuptiEvents_.instanceCount(grp); + for (const auto& id : cuptiEvents_.eventsInGroup(grp)) { + VLOG(0) << "Instance count for " << id << ":" << instance_count; + events_[id].instanceCount = instance_count; + } + } +} + +EventGroupSet::~EventGroupSet() { + // Disable EventGroupSet in Cupti. 
+ if (enabled_) { + setEnabled(false); + } +} + +// Enable or disable this group set +void EventGroupSet::setEnabled(bool enabled) { + if (enabled && !enabled_) { + cuptiEvents_.enableGroupSet(set_); + } else if (!enabled && enabled_) { + cuptiEvents_.disableGroupSet(set_); + } + enabled_ = enabled; +} + +// Collect counter values for each counter in group set +void EventGroupSet::collectSample() { + auto timestamp = system_clock::now(); + for (int g = 0; g < set_.numEventGroups; g++) { + CUpti_EventGroup grp = set_.eventGroups[g]; + for (const auto& id : cuptiEvents_.eventsInGroup(grp)) { + Event& ev = events_[id]; + vector vals(ev.instanceCount); + // FIXME: Use cuptiEventGroupReadAllEvents + cuptiEvents_.readEvent(grp, id, vals); + + if (VLOG_IS_ON(0)) { + for (int64_t v : vals) { + if (v == CUPTI_EVENT_OVERFLOW) { + LOG(WARNING) << "Counter overflow detected " + << "- decrease sample period!" << endl; + } + } + } + + ev.addSample(timestamp, vals); + } + } + + if (VLOG_IS_ON(1)) { + auto t2 = system_clock::now(); + VLOG(1) << "Device " << cuptiEvents_.device() << " Sample (us): " + << duration_cast(t2 - timestamp).count(); + } +} + +// Print names of events in this group set, ordered by group +void EventGroupSet::printDescription(ostream& s) const { + for (int g = 0; g < set_.numEventGroups; g++) { + s << " Events in group " << g << ": "; + for (const auto& id : cuptiEvents_.eventsInGroup(set_.eventGroups[g])) { + s << id << " (" << events_[id].name << ") "; + } + s << endl; + } +} + +// --------------------------------------------------------------------- +// class EventProfiler +// --------------------------------------------------------------------- + +// Find nearest factor of a number by linear search, +// starting at hi and lo - hi searches up and lo searches down +static int nearestFactor(int hi, int lo, int number) { + return number % hi == 0 + ? hi + : number % lo == 0 ? 
lo : nearestFactor(hi + 1, lo - 1, number); +} + +static int nearestFactor(int count, int max) { + return nearestFactor(count, count, max); +} + +void EventProfiler::initEvents(const std::set& eventNames) { + events_.clear(); + // Build event map + for (const auto& name : eventNames) { + events_.emplace(cuptiEvents_->eventId(name), name); + } +} + +void EventProfiler::initMetrics(const std::set& metricNames) { + metrics_.clear(); + // Add events from metrics + metrics_.reserve(metricNames.size()); + for (const auto& metric_name : metricNames) { + CUpti_MetricID metric_id = cuptiMetrics_->idFromName(metric_name); + if (metric_id == ~0) { + continue; + } + + const auto& events = cuptiMetrics_->events(metric_id); + vector event_ids; + event_ids.reserve(events.size()); + for (const auto& pair : events) { + CUpti_EventID id = pair.first; + const string& event_name = pair.second; + if (event_name.empty()) { + // For unnamed events, use metric name and event id + // FIXME: For subsequent metrics using the same event, + // this will be confusing + events_.emplace(id, metric_name + "_" + event_name); + } else { + events_.emplace(id, event_name); + } + event_ids.push_back(id); + } + metrics_.emplace_back( + metric_name, + metric_id, + event_ids, + cuptiMetrics_->evaluationMode(metric_id), + *cuptiMetrics_); + } +} + +bool EventProfiler::initEventGroups() { + sets_.clear(); + if (eventGroupSets_) { + cuptiEvents_->destroyGroupSets(eventGroupSets_); + eventGroupSets_ = nullptr; + } + if (events_.empty()) { + return true; + } + + // Determine sets of groups to be collected + vector ids; + ids.reserve(events_.size()); + for (const auto& ev : events_) { + ids.push_back(ev.first); + } + eventGroupSets_ = cuptiEvents_->createGroupSets(ids); + VLOG(0) << "Number of group sets: " << eventGroupSets_->numSets; + for (int i = 0; i < eventGroupSets_->numSets; i++) { + sets_.push_back( + EventGroupSet(eventGroupSets_->sets[i], events_, *cuptiEvents_)); + } + return !sets_.empty(); +} + +static unique_ptr alignAndValidateConfigs( + Config& base, + Config* onDemand) { + auto now = system_clock::now(); + if (!onDemand || + now > + (onDemand->eventProfilerOnDemandStartTime() + + onDemand->eventProfilerOnDemandDuration())) { + base.validate(now); + return base.clone(); + } + + auto res = base.clone(); + res->addEvents(onDemand->eventNames()); + res->addMetrics(onDemand->metricNames()); + + int sample_period = + std::min(base.samplePeriod().count(), onDemand->samplePeriod().count()); + if (sample_period < base.samplePeriod().count() && + (base.samplePeriod().count() % sample_period) != 0) { + sample_period = nearestFactor(sample_period, base.samplePeriod().count()); + LOG(WARNING) + << "On-demand sample period must be a factor of base sample period. " + << "Adjusting from " << onDemand->samplePeriod().count() << "ms to " + << sample_period << "ms."; + } + base.setSamplePeriod(milliseconds(sample_period)); + base.validate(now); + res->setSamplePeriod(base.samplePeriod()); + res->setMultiplexPeriod(base.multiplexPeriod()); + res->validate(now); + onDemand->setSamplePeriod(base.samplePeriod()); + onDemand->setMultiplexPeriod(base.multiplexPeriod()); + onDemand->validate(now); + + return res; +} + +static milliseconds minReportPeriod(const Config& config, int num_sets) { + return config.multiplexPeriod() * num_sets; +} + +static bool canSupportReportPeriod(const Config& config, int num_sets) { + // Can we get through the groups an even number per report period? 
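+  // e.g. with a 2000ms multiplex period and 3 event sets the minimum
+  // report period is 6000ms: a 10000ms report period fails
+  // (10000 % 6000 != 0) while 12000ms is accepted.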
+  milliseconds min_report_period = minReportPeriod(config, num_sets);
+  return (config.reportPeriod().count() % min_report_period.count()) == 0;
+}
+
+static int completeSamplesPerReport(const Config& config, int num_sets) {
+  if (num_sets <= 1) {
+    return config.reportPeriod() / config.samplePeriod();
+  }
+  // Number of complete sample collections in the report period
+  // E.g. if report period is 10000ms, sample period 500ms,
+  // multiplex period 2000ms and num_sets is 5 then # of complete samples is
+  // (2000ms / 500ms) * (10000ms / 2000ms / 5) = 4 * 1 = 4
+  int samples_per_multiplex_period =
+      config.multiplexPeriod() / config.samplePeriod();
+  int multiplex_periods_per_report =
+      config.reportPeriod() / config.multiplexPeriod();
+  return (multiplex_periods_per_report / num_sets) *
+      samples_per_multiplex_period;
+}
+
+static bool canSupportSamplesPerReport(const Config& config, int num_sets) {
+  // Can samples per report be honored with an exact *full* set of samples?
+  // We don't support partial samples at this point.
+  int full_samples_per_report = completeSamplesPerReport(config, num_sets);
+  return (full_samples_per_report % config.samplesPerReport()) == 0;
+}
+
+static void adjustConfig(Config& config, int num_sets) {
+  // Don't change sample period and multiplex period here, since that can
+  // cause overflows and perf degradation. Report period and samples per
+  // report are OK to change (with warning).
+  if (!canSupportReportPeriod(config, num_sets)) {
+    milliseconds min_report_period = minReportPeriod(config, num_sets);
+    LOG(WARNING) << "Report period must be a multiple of "
+                 << min_report_period.count() << "ms (" << num_sets
+                 << " event sets * " << config.multiplexPeriod().count()
+                 << "ms multiplex period), in order to get complete samples.";
+    auto new_report_period =
+        Config::alignUp(config.reportPeriod(), min_report_period);
+    double sf =
+        ((double)new_report_period.count()) / config.reportPeriod().count();
+    int new_samples_per_report = std::round(config.samplesPerReport() * sf);
+    LOG(WARNING) << "Adjusting report period from "
+                 << config.reportPeriod().count() << "ms to "
+                 << new_report_period.count() << "ms";
+    if (new_samples_per_report != config.samplesPerReport()) {
+      LOG(WARNING) << "Adjusting samples per report from "
+                   << config.samplesPerReport() << " to "
+                   << new_samples_per_report;
+    }
+    config.setReportPeriod(new_report_period);
+    config.setSamplesPerReport(new_samples_per_report);
+  }
+  // Ensure that samples per report can be honored with
+  // an exact *full* set of samples. Don't support partial
+  // samples at this point.
+  if (!canSupportSamplesPerReport(config, num_sets)) {
+    int full_samples_per_report = completeSamplesPerReport(config, num_sets);
+    int adjusted_count =
+        nearestFactor(config.samplesPerReport(), full_samples_per_report);
+    LOG(WARNING)
+        << "Samples per report must be such that an even number of "
+        << "complete samples can be aggregated in each report period. 
Adjusting" + << " from " << config.samplesPerReport() << " to " << adjusted_count + << " (complete sample count is " << full_samples_per_report << ")"; + config.setSamplesPerReport(adjusted_count); + } +} + +// Prepare profiler +EventProfiler::EventProfiler( + std::unique_ptr cupti_events, + std::unique_ptr cupti_metrics, + vector>& loggers, + vector>& onDemandLoggers) + : cuptiEvents_(std::move(cupti_events)), + cuptiMetrics_(std::move(cupti_metrics)), + loggers_(loggers), + onDemandLoggers_(onDemandLoggers) {} + +void EventProfiler::reportSamples() { + dispatchSamples(*config_, loggers_, baseSamples_); + baseSamples_ += completeSamplesPerReport(*config_, sets_.size()); +} + +void EventProfiler::reportOnDemandSamples() { + dispatchSamples(*onDemandConfig_, onDemandLoggers_, onDemandSamples_); + onDemandSamples_ += completeSamplesPerReport(*onDemandConfig_, sets_.size()); +} + +EventProfiler::~EventProfiler() { + if (eventGroupSets_) { + for (auto& set : sets_) { + set.setEnabled(false); + } + cuptiEvents_->destroyGroupSets(eventGroupSets_); + } + VLOG(0) << "Stopped event profiler for device " << device(); +} + +void EventProfiler::updateLoggers(Config& config, Config* on_demand_config) { + // Update loggers. + for (auto& logger : loggers_) { + std::lock_guard lock(logMutex()); + logger->update(config); + } + + if (on_demand_config) { + // Update onDemand loggers. + for (auto& logger : onDemandLoggers_) { + std::lock_guard lock(logMutex()); + logger->update(*on_demand_config); + } + } +} + +bool EventProfiler::applyConfig(const Config& config) { + // Initialize events, metrics, and event group sets. + // TODO: Send warnings / errors back to dyno for onDemand config + try { + if (!initEventsAndMetrics(config)) { + return false; + } + } catch (const std::exception& ex) { + LOG(WARNING) << "Failed to apply config (" << ex.what() << ")"; + return false; + } + + return true; +} + +bool EventProfiler::initEventsAndMetrics(const Config& config) { + initEvents(config.eventNames()); + initMetrics(config.metricNames()); + // We now have the total list of events to collect + // They need to be organized into groups for multiplexing + if (!initEventGroups()) { + LOG(WARNING) << "No events/metrics initialized successfully"; + return false; + } + + if (VLOG_IS_ON(1)) { + printMetrics(LIBKINETO_DBG_STREAM); + printSets(LIBKINETO_DBG_STREAM); + } + return true; +} + +void EventProfiler::printSets(ostream& s) const { + for (int i = 0; i < sets_.size(); i++) { + s << "Set " << i << endl; + sets_[i].printDescription(s); + } +} + +void EventProfiler::printMetrics(ostream& s) const { + s << "Metrics:" << endl; + for (const Metric& m : metrics_) { + m.printDescription(s); + } +} + +void EventProfiler::printAllSamples(ostream& s, CUdevice device) const { + for (const auto& pair : events_) { + const Event& ev = pair.second; + ev.printSamples(s, device); + } +} + +void EventProfiler::enableNextCounterSet() { + if (sets_.size() > 1) { + auto t1 = system_clock::now(); + + VLOG(1) << "Disabling set " << curEnabledSet_; + sets_[curEnabledSet_].setEnabled(false); + curEnabledSet_ = (curEnabledSet_ + 1) % sets_.size(); + VLOG(1) << "Enabling set " << curEnabledSet_; + sets_[curEnabledSet_].setEnabled(true); + + if (VLOG_IS_ON(1)) { + auto t2 = system_clock::now(); + VLOG(1) << "Switch (us): " + << duration_cast(t2 - t1).count(); + } + } +} + +// Notify listeners of collected samples +void EventProfiler::dispatchSamples( + const Config& config, + const vector>& loggers, + int sample_offset) { + Sample 
sample(events_.size() + metrics_.size()); + // Normalize values to per second + auto delta = config.reportPeriod() / config.samplesPerReport(); + double sf = 1000.0 * sets_.size() / delta.count(); + for (int i = 0; i < config.samplesPerReport(); i++) { + sample.stats.clear(); + sample.deltaMsec = (delta * i).count(); + SampleSlice slice = {sample_offset, i, config.samplesPerReport()}; + VLOG(1) << "Slice: " << sample_offset << ", " << i << ", " + << config.samplesPerReport(); + for (const auto& pair : events_) { + const Event& ev = pair.second; + int64_t total = std::round(sf * ev.sumAll(slice)); + PercentileList pcs = initPercentiles(config.percentiles()); + normalize(ev.percentiles(pcs, slice), sf); + sample.stats.push_back({ev.name, std::move(pcs), SampleValue(total)}); + } + + for (auto& m : metrics_) { + // calculate returns a pair of per-SM vector and a total + auto vals = m.calculate(events_, delta, slice); + PercentileList pcs = initPercentiles(config.percentiles()); + sample.stats.push_back( + {m.name, std::move(percentiles(vals.perInstance, pcs)), vals.total}); + } + + for (auto& logger : loggers) { + std::lock_guard lock(logMutex()); + logger->handleSample(device(), sample, config.ipcFabricEnabled()); + } + } + + if (VLOG_IS_ON(2)) { + printAllSamples(LIBKINETO_DBG_STREAM, device()); + } +} + +void EventProfiler::configure(Config& config, Config* onDemandConfig) { + if (!sets_.empty()) { + sets_[curEnabledSet_].setEnabled(false); + clearSamples(); + } + + config_ = config.clone(); + onDemandConfig_ = onDemandConfig ? onDemandConfig->clone() : nullptr; + mergedConfig_ = alignAndValidateConfigs(*config_, onDemandConfig_.get()); + if (!applyConfig(*mergedConfig_)) { + LOG(WARNING) << "Failed to apply config!"; + mergedConfig_ = config_->clone(); + applyConfig(*config_); + } + if (!sets_.empty()) { + // Make timing adjustments based on multiplexing requirements. + adjustConfig(*config_, sets_.size()); + if (onDemandConfig_) { + int duration = onDemandConfig_->eventProfilerOnDemandDuration().count(); + LOG(INFO) << "On demand profiler activated for " << duration << " secs"; + adjustConfig(*onDemandConfig_, sets_.size()); + } + // If events or metrics were added or removed, need to tell loggers + updateLoggers(*config_, onDemandConfig_.get()); + } + + curEnabledSet_ = 0; + if (!sets_.empty()) { + sets_[0].setEnabled(true); + } else { + VLOG(0) << "No counters profiled!"; + } + + baseSamples_ = 0; + onDemandSamples_ = 0; +} + +void EventProfiler::collectSample() { + if (sets_.empty()) { + return; + } + sets_[curEnabledSet_].collectSample(); + if (VLOG_IS_ON(1)) { + printAllSamples(LIBKINETO_DBG_STREAM, device()); + } +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/EventProfiler.h b/tb_plugins/profiling/libkineto/src/EventProfiler.h new file mode 100644 index 0000000000000000000000000000000000000000..fafd5b9bb8336b28b210ba58d588d3a798a73969 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/EventProfiler.h @@ -0,0 +1,341 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "Config.h" +#include "CuptiEventApi.h" +#include "CuptiMetricApi.h" +#include "SampleListener.h" + +namespace KINETO_NAMESPACE { + +// Helper function for computing percentiles (nearest-rank). +// Modifies the input. 
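+// Example: given values {10, 20, 30, 40}, the 50th percentile entry picks
+// index (50 * 4) / 100 = 2, i.e. 30, and the 100th picks
+// min(3, (100 * 4) / 100) = 3, i.e. 40 (nearest-rank, zero-based).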
+template <typename T>
+inline PercentileList& percentiles(std::vector<T> values, PercentileList& pcs) {
+  auto size = values.size();
+  for (auto& x : pcs) {
+    int idx = std::min(size - 1, (x.first * size) / 100);
+    std::nth_element(values.begin(), values.begin() + idx, values.end());
+    x.second = SampleValue(values[idx]);
+  }
+  return pcs;
+}
+
+// Helper function for normalizing a percentile list
+// Modifies the input
+inline PercentileList& normalize(PercentileList& pcs, double sf) {
+  for (auto& pc : pcs) {
+    pc.second *= sf;
+  }
+  return pcs;
+}
+
+// A slice of the sample buffer
+struct SampleSlice {
+  // Start offset (samples)
+  int offset;
+  // Slice number
+  int index;
+  // Out of this many
+  int count;
+};
+
+// A sampled event
+class Event {
+ public:
+  /* implicit */ Event(std::string name) : name(std::move(name)) {}
+  /* implicit */ Event(const char* name) : name(name) {}
+  Event() : name("INVALID") {}
+
+  Event(const Event&) = delete;
+  Event& operator=(const Event&) = delete;
+  Event(Event&&) = default;
+  Event& operator=(Event&&) = default;
+
+  void addSample(
+      std::chrono::time_point<std::chrono::system_clock> timestamp,
+      const std::vector<int64_t>& values) {
+    assert(values.size() == instanceCount);
+    samples_.emplace_back(timestamp, values);
+  }
+
+  // Sum samples for a single domain instance
+  int64_t sumInstance(int i, const SampleSlice& slice) const;
+
+  // Sum all samples across all domain instances
+  int64_t sumAll(const SampleSlice& slice) const;
+
+  // Create list of percentiles
+  PercentileList& percentiles(PercentileList& pcs, const SampleSlice& slice)
+      const;
+
+  void eraseSamples(int count) {
+    auto end = samples_.begin();
+    std::advance(end, count);
+    samples_.erase(samples_.begin(), end);
+  }
+
+  void clearSamples() {
+    samples_.clear();
+  }
+
+  int sampleCount() {
+    return samples_.size();
+  }
+
+  void printSamples(std::ostream& s, CUdevice device) const;
+
+  // Event name (see nvprof --query-events)
+  std::string name;
+
+  // Number of domain instances for this event, e.g. number of SMs
+  int instanceCount = 0;
+
+ private:
+  std::pair<int, int> toIdxRange(const SampleSlice& slice) const {
+    int size = (samples_.size() - slice.offset) / slice.count;
+    return std::make_pair(slice.offset + (slice.index * size), size);
+  }
+
+  // List of collected samples, where each sample has values for
+  // one or more domain instances
+  using Sample = std::pair<
+      std::chrono::time_point<std::chrono::system_clock>,
+      std::vector<int64_t>>;
+  std::list<Sample> samples_;
+};
+
+class Metric {
+ public:
+  Metric(
+      std::string name,
+      CUpti_MetricID id,
+      std::vector<CUpti_EventID> events,
+      CUpti_MetricEvaluationMode eval_mode,
+      CuptiMetricApi& cupti_metrics);
+
+  struct CalculatedValues {
+    std::vector<SampleValue> perInstance;
+    SampleValue total;
+  };
+
+  struct CalculatedValues calculate(
+      std::map<CUpti_EventID, Event>& events,
+      std::chrono::nanoseconds sample_duration,
+      const SampleSlice& slice);
+
+  int instanceCount(std::map<CUpti_EventID, Event>& events) {
+    return events[events_[0]].instanceCount;
+  }
+
+  void printDescription(std::ostream& s) const;
+
+  std::string name;
+
+ private:
+  CUpti_MetricID id_;
+  std::vector<CUpti_EventID> events_;
+  CUpti_MetricEvaluationMode evalMode_;
+  // Calls to CUPTI are encapsulated behind this interface
+  CuptiMetricApi& cuptiMetrics_;
+  CUpti_MetricValueKind valueKind_;
+};
+
+/**
+ * A set of event groups.
+ * Holds all the events that may be collected in a single pass.
+ * A group contains one or more counters for a single domain.
+ * A group set contains zero or one groups per domain.
+ */
+class EventGroupSet {
+ public:
+  EventGroupSet(
+      CUpti_EventGroupSet& set,
+      std::map<CUpti_EventID, Event>& events,
+      CuptiEventApi& cupti);
+  ~EventGroupSet();
+
+  EventGroupSet(const EventGroupSet&) = delete;
+  EventGroupSet& operator=(const EventGroupSet&) = delete;
+  EventGroupSet(EventGroupSet&&) = default;
+  EventGroupSet& operator=(EventGroupSet&&) = delete;
+
+  // Number of groups = number of domains profiled
+  int groupCount() const {
+    return set_.numEventGroups;
+  }
+
+  void setEnabled(bool enabled);
+  // Take a sample of counters in this group set
+  void collectSample();
+  void printDescription(std::ostream& s) const;
+
+ private:
+  CUpti_EventGroupSet& set_;
+  std::map<CUpti_EventID, Event>& events_;
+  // Calls to CUPTI are encapsulated behind this interface
+  CuptiEventApi& cuptiEvents_;
+  bool enabled_;
+};
+
+// The sampler
+class EventProfiler {
+ public:
+  explicit EventProfiler(
+      std::unique_ptr<CuptiEventApi> cupti_events,
+      std::unique_ptr<CuptiMetricApi> cupti_metrics,
+      std::vector<std::unique_ptr<SampleListener>>& loggers,
+      std::vector<std::unique_ptr<SampleListener>>& onDemandLoggers);
+  EventProfiler(const EventProfiler&) = delete;
+  EventProfiler& operator=(const EventProfiler&) = delete;
+  ~EventProfiler();
+
+  void configure(Config& config, Config* onDemandConfig);
+
+  bool isOnDemandActive() {
+    return !!onDemandConfig_;
+  }
+
+  // Print the counter sets. Multiple sets will be multiplexed.
+  void printSets(std::ostream& s) const;
+
+  // Print metrics descriptions
+  void printMetrics(std::ostream& s) const;
+
+  bool enableForDevice(Config& cfg);
+
+  CUdevice device() {
+    return cuptiEvents_->device();
+  }
+
+  bool setContinuousMode() {
+    return cuptiEvents_->setContinuousMode();
+  }
+
+  std::chrono::milliseconds samplePeriod() {
+    return mergedConfig_->samplePeriod();
+  }
+
+  std::chrono::milliseconds multiplexPeriod() {
+    return mergedConfig_->multiplexPeriod();
+  }
+
+  std::chrono::milliseconds reportPeriod() {
+    return config_->reportPeriod();
+  }
+
+  std::chrono::milliseconds onDemandReportPeriod() {
+    return onDemandConfig_->reportPeriod();
+  }
+
+  // Read values of currently running counters.
+  void collectSample();
+
+  void reportSamples();
+  void reportOnDemandSamples();
+
+  bool enabled() {
+    return sets_.size() > 0;
+  }
+
+  bool multiplexEnabled() {
+    return sets_.size() > 1;
+  }
+
+  // Multiplex counters.
+  void enableNextCounterSet();
+
+  void eraseReportedSamples() {
+    int erase_count = baseSamples_;
+    if (onDemandConfig_ &&
+        onDemandConfig_->eventProfilerOnDemandDuration().count() > 0) {
+      erase_count = std::min(baseSamples_, onDemandSamples_);
+    }
+    eraseSamples(erase_count);
+    baseSamples_ -= erase_count;
+    onDemandSamples_ -= erase_count;
+  }
+
+  void clearSamples() {
+    for (auto& pair : events_) {
+      pair.second.clearSamples();
+    }
+    baseSamples_ = 0;
+    onDemandSamples_ = 0;
+  }
+
+ private:
+  // Functions to initialize profiler based on Config settings.
+  bool applyConfig(const Config& config);
+  bool initEventsAndMetrics(const Config& config);
+  void initEvents(const std::set<std::string>& eventNames);
+  void initMetrics(const std::set<std::string>& metricNames);
+  bool initEventGroups();
+
+  PercentileList initPercentiles(const std::vector<int>& percentiles) {
+    PercentileList res;
+    res.reserve(percentiles.size());
+    for (int p : percentiles) {
+      res.emplace_back(p, SampleValue(0));
+    }
+    return res;
+  }
+
+  // Notify listeners of collected samples
+  void dispatchSamples(
+      const Config& config,
+      const std::vector<std::unique_ptr<SampleListener>>& loggers,
+      int report_nr);
+
+  void eraseSamples(int count) {
+    for (auto& pair : events_) {
+      pair.second.eraseSamples(count);
+    }
+  }
+
+  void updateLoggers(Config& config, Config* on_demand_config);
+
+  // Print all collected samples since last clear.
+  void printAllSamples(std::ostream& s, CUdevice device) const;
+
+  // Calls to CUPTI are encapsulated behind these interfaces
+  std::unique_ptr<CuptiEventApi> cuptiEvents_;
+  std::unique_ptr<CuptiMetricApi> cuptiMetrics_;
+  // The CUpti API reports event IDs, we must map them to our event objects
+  std::map<CUpti_EventID, Event> events_;
+  // List of metrics
+  std::vector<Metric> metrics_;
+  // The counter sets needed to collect all counters
+  std::vector<EventGroupSet> sets_;
+  // The event group set object returned by Cupti.
+  // Saved s.t. we can call cuptiEventGroupSetsDestroy to free memory when
+  // the object is no longer needed.
+  CUpti_EventGroupSets* eventGroupSets_ = nullptr;
+  // Current multiplexed counter set
+  int curEnabledSet_{0};
+
+  std::unique_ptr<Config> config_;
+  std::unique_ptr<Config> onDemandConfig_;
+  std::unique_ptr<Config> mergedConfig_;
+  int baseSamples_{0};
+  int onDemandSamples_{0};
+
+  // Shared between profiler threads
+  // Vectors are read-only but calling loggers requires a lock
+  const std::vector<std::unique_ptr<SampleListener>>& loggers_;
+  const std::vector<std::unique_ptr<SampleListener>>& onDemandLoggers_;
+};
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/EventProfilerController.cpp b/tb_plugins/profiling/libkineto/src/EventProfilerController.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0427cc7a90cbc49d31262bcce63f1f81c5b6293f
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/EventProfilerController.cpp
@@ -0,0 +1,423 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
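+
+// Drives event profiling for a single CUDA context: owns an EventProfiler,
+// runs the sampling loop on a dedicated thread, and applies both the base
+// config and on-demand configs delivered through the ConfigLoader.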
+ +#include "EventProfilerController.h" + +#include +#include +#include + +#include "ConfigLoader.h" +#include "CuptiEventApi.h" +#include "CuptiMetricApi.h" +#include "EventProfiler.h" +#include "output_csv.h" + +#include "Logger.h" +#include "ThreadUtil.h" + +using namespace std::chrono; +using std::unique_ptr; +using std::vector; + +namespace KINETO_NAMESPACE { + +namespace { + +vector(const Config&)>>& +loggerFactories() { + static vector(const Config&)>> + factories; + return factories; +} + +vector(const Config&)>>& +onDemandLoggerFactories() { + static vector(const Config&)>> + factories; + return factories; +} + +vector> makeLoggers(const Config& config) { + vector> loggers; + for (const auto& factory : loggerFactories()) { + loggers.push_back(factory(config)); + } + loggers.push_back(std::make_unique()); + loggers.push_back(std::make_unique()); + return loggers; +} + +vector> makeOnDemandLoggers( + const Config& config) { + vector> loggers; + for (const auto& factory : onDemandLoggerFactories()) { + loggers.push_back(factory(config)); + } + loggers.push_back(std::make_unique()); + return loggers; +} + +vector>& loggers(const Config& config) { + static auto res = makeLoggers(config); + return res; +} + +vector>& onDemandLoggers( + const Config& config) { + static auto res = makeOnDemandLoggers(config); + return res; +} + +} // anon namespace + +// Keep an eye on profiling threads. +// We've observed deadlocks in Cuda11 in libcuda / libcupti.. +namespace detail { + +class HeartbeatMonitor { + + public: + ~HeartbeatMonitor() { + stopMonitoring(); + } + + static HeartbeatMonitor& instance() { + static HeartbeatMonitor monitor; + return monitor; + } + + void profilerHeartbeat() { + int32_t tid = systemThreadId(); + std::lock_guard lock(mutex_); + profilerAliveMap_[tid]++; + } + + void setPeriod(seconds period) { + { + std::lock_guard lock(mutex_); + if (period_ == period) { + return; + } + period_ = period; + } + if (period == seconds(0)) { + stopMonitoring(); + } else { + startMonitoring(); + } + } + + private: + HeartbeatMonitor() = default; + + void monitorLoop() { + std::unique_lock lock(mutex_); + while(!stopMonitor_) { + auto cv_status = condVar_.wait_for(lock, seconds(period_)); + // Don't perform check on spurious wakeup or on notify + if (cv_status == std::cv_status::timeout) { + for (auto& pair : profilerAliveMap_) { + int32_t tid = pair.first; + int& i = pair.second; + if (i == 0) { + LOG(ERROR) << "Thread " << tid << " appears stuck!"; + } + i = 0; + } + } + } + } + + void startMonitoring() { + if (!monitorThread_) { + VLOG(0) << "Starting monitoring thread"; + stopMonitor_ = false; + monitorThread_ = std::make_unique( + &HeartbeatMonitor::monitorLoop, this); + } + } + + void stopMonitoring() { + if (monitorThread_) { + VLOG(0) << "Stopping monitoring thread"; + stopMonitor_ = true; + condVar_.notify_one(); + monitorThread_->join(); + monitorThread_ = nullptr; + VLOG(0) << "Monitoring thread terminated"; + } + } + + std::map profilerAliveMap_; + std::unique_ptr monitorThread_; + std::mutex mutex_; + std::condition_variable condVar_; + std::atomic_bool stopMonitor_{false}; + seconds period_{0}; +}; + +} // namespace detail + +namespace { +// Profiler map singleton +std::map>& profilerMap() { + static std::map> instance; + return instance; +} + +void reportLateSample( + int sleepMs, + int sampleMs, + int reportMs, + int reprogramMs) { + LOG_EVERY_N(WARNING, 10) << "Lost sample due to delays (ms): " << sleepMs + << ", " << sampleMs << ", " << reportMs << ", " + << reprogramMs; 
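+  // Rate-limited to every 10th occurrence; the four columns correspond to
+  // the sleep, sample, report and reprogram phases of the sampling loop.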
+} + +void configureHeartbeatMonitor( + detail::HeartbeatMonitor& monitor, const Config& base, const Config* onDemand) { + seconds base_period = + base.eventProfilerHeartbeatMonitorPeriod(); + seconds on_demand_period = !onDemand ? seconds(0) : + onDemand->eventProfilerHeartbeatMonitorPeriod(); + monitor.setPeriod( + on_demand_period > seconds(0) ? on_demand_period : base_period); +} + +} // anon namespace + +void EventProfilerController::addLoggerFactory( + std::function(const Config&)> factory) { + loggerFactories().push_back(factory); +} + +void EventProfilerController::addOnDemandLoggerFactory( + std::function(const Config&)> factory) { + onDemandLoggerFactories().push_back(factory); +} + +EventProfilerController::EventProfilerController( + CUcontext context, + ConfigLoader& configLoader, + detail::HeartbeatMonitor& heartbeatMonitor) + : configLoader_(configLoader), heartbeatMonitor_(heartbeatMonitor) { + auto cupti_events = std::make_unique(context); + auto cupti_metrics = + std::make_unique(cupti_events->device()); + configLoader_.addHandler( + ConfigLoader::ConfigKind::EventProfiler, this); + auto config = configLoader.getConfigCopy(); + profiler_ = std::make_unique( + std::move(cupti_events), + std::move(cupti_metrics), + loggers(*config), + onDemandLoggers(*config)); + profilerThread_ = std::make_unique( + &EventProfilerController::profilerLoop, this); +} + +EventProfilerController::~EventProfilerController() { + if (profilerThread_) { + // signaling termination of the profiler loop + stopRunloop_ = true; + profilerThread_->join(); + } + configLoader_.removeHandler( + ConfigLoader::ConfigKind::EventProfiler, this); + VLOG(0) << "Stopped event profiler"; +} + +// Must be called under lock +void EventProfilerController::start(CUcontext ctx, ConfigLoader& configLoader) { + profilerMap()[ctx] = unique_ptr( + new EventProfilerController( + ctx, configLoader, detail::HeartbeatMonitor::instance())); +} + +// Must be called under lock +void EventProfilerController::stop(CUcontext ctx) { + profilerMap()[ctx] = nullptr; +} + +bool EventProfilerController::canAcceptConfig() { + std::lock_guard guard(mutex_); + return !newOnDemandConfig_; +} + +void EventProfilerController::acceptConfig(const Config& config) { + if (config.eventProfilerOnDemandDuration().count() == 0) { + // Ignore - not for this profiler + return; + } + std::lock_guard guard(mutex_); + if (newOnDemandConfig_) { + LOG(ERROR) << "On demand request already queued - ignoring new request"; + return; + } + newOnDemandConfig_ = config.clone(); + LOG(INFO) << "Received new on-demand config"; +} + +bool EventProfilerController::enableForDevice(Config& cfg) { + // FIXME: Use device unique id! + if (!cfg.eventProfilerEnabledForDevice(profiler_->device())) { + return false; + } + // context count includes the new context + int instances = configLoader_.contextCountForGpu(profiler_->device()); + VLOG(0) << "Device context count: " << instances; + return instances >= 0 && instances <= cfg.maxEventProfilersPerGpu(); +} + +void EventProfilerController::profilerLoop() { + // We limit the number of profilers that can exist per GPU + auto config = configLoader_.getConfigCopy(); + if (!enableForDevice(*config)) { + VLOG(0) << "Not starting EventProfiler - profilers for GPU " + << profiler_->device() << " exceeds profilers per GPU limit (" + << config->maxEventProfilersPerGpu() << ")"; + return; + } + + if (!profiler_->setContinuousMode()) { + VLOG(0) << "Continuous mode not supported for GPU " + << profiler_->device() << ". 
Not starting Event Profiler."; + return; + } + + VLOG(0) << "Starting Event Profiler for GPU " << profiler_->device(); + setThreadName("CUPTI Event Profiler"); + + time_point next_sample_time; + time_point next_report_time; + time_point next_on_demand_report_time; + time_point next_multiplex_time; + std::unique_ptr on_demand_config = nullptr; + bool reconfigure = true; + bool restart = true; + int report_count = 0; + int on_demand_report_count = 0; + while (!stopRunloop_) { + heartbeatMonitor_.profilerHeartbeat(); + if (configLoader_.hasNewConfig(*config)) { + config = configLoader_.getConfigCopy(); + VLOG(0) << "Base config changed"; + report_count = 0; + reconfigure = true; + } + + auto now = system_clock::now(); + if (on_demand_config && + now > (on_demand_config->eventProfilerOnDemandStartTime() + + on_demand_config->eventProfilerOnDemandDuration())) { + on_demand_config = nullptr; + LOG(INFO) << "On-demand profiling complete"; + reconfigure = true; + } + + if (!profiler_->isOnDemandActive()) { + std::lock_guard lock(mutex_); + if (newOnDemandConfig_) { + VLOG(0) << "Received on-demand config, reconfiguring"; + on_demand_config = std::move(newOnDemandConfig_); + reconfigure = true; + on_demand_report_count = 0; + } + } + + if (reconfigure) { + try { + profiler_->configure(*config, on_demand_config.get()); + } catch (const std::exception& ex) { + LOG(ERROR) << "Encountered error while configuring event profiler: " + << ex.what(); + // Exit profiling entirely when encountering an error here + // as it indicates a serious problem or bug. + break; + } + configureHeartbeatMonitor( + heartbeatMonitor_, *config, on_demand_config.get()); + reconfigure = false; + restart = true; + } + + if (restart) { + now = system_clock::now(); + next_sample_time = now + profiler_->samplePeriod(); + next_report_time = now + profiler_->reportPeriod(); + if (profiler_->isOnDemandActive()) { + next_on_demand_report_time = now + profiler_->onDemandReportPeriod(); + } + next_multiplex_time = now + profiler_->multiplexPeriod(); + // Collect an initial sample and throw it away + // The next sample is the first valid one + profiler_->collectSample(); + profiler_->clearSamples(); + restart = false; + } + + auto start_sleep = now; + while (now < next_sample_time) { + /* sleep override */ + std::this_thread::sleep_for(next_sample_time - now); + now = system_clock::now(); + } + int sleep_time = duration_cast(now - start_sleep).count(); + + auto start_sample = now; + profiler_->collectSample(); + now = system_clock::now(); + int sample_time = duration_cast(now - start_sample).count(); + + next_sample_time += profiler_->samplePeriod(); + if (now > next_sample_time) { + reportLateSample(sleep_time, sample_time, 0, 0); + restart = true; + continue; + } + + auto start_report = now; + if (now > next_report_time) { + VLOG(1) << "Report #" << report_count++; + profiler_->reportSamples(); + next_report_time += profiler_->reportPeriod(); + } + if (profiler_->isOnDemandActive() && now > next_on_demand_report_time) { + VLOG(1) << "OnDemand Report #" << on_demand_report_count++; + profiler_->reportOnDemandSamples(); + next_on_demand_report_time += profiler_->onDemandReportPeriod(); + } + profiler_->eraseReportedSamples(); + now = system_clock::now(); + int report_time = duration_cast(now - start_report).count(); + + if (now > next_sample_time) { + reportLateSample(sleep_time, sample_time, report_time, 0); + restart = true; + continue; + } + + auto start_multiplex = now; + if (profiler_->multiplexEnabled() && now > 
next_multiplex_time) { + profiler_->enableNextCounterSet(); + next_multiplex_time += profiler_->multiplexPeriod(); + } + now = system_clock::now(); + int multiplex_time = + duration_cast(now - start_multiplex).count(); + + if (now > next_sample_time) { + reportLateSample(sleep_time, sample_time, report_time, multiplex_time); + restart = true; + } + + VLOG(0) << "Runloop execution time: " + << duration_cast(now - start_sample).count() << "ms"; + } + + VLOG(0) << "Device " << profiler_->device() + << ": Exited event profiling loop"; +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/EventProfilerController.h b/tb_plugins/profiling/libkineto/src/EventProfilerController.h new file mode 100644 index 0000000000000000000000000000000000000000..007a82faa9289ada9256d09907167471eb6520b9 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/EventProfilerController.h @@ -0,0 +1,63 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include "ConfigLoader.h" + +namespace KINETO_NAMESPACE { + +class Config; +class ConfigLoader; +class EventProfiler; +class SampleListener; + +namespace detail { +class HeartbeatMonitor; +} + +class EventProfilerController : public ConfigLoader::ConfigHandler { + public: + EventProfilerController(const EventProfilerController&) = delete; + EventProfilerController& operator=(const EventProfilerController&) = delete; + + ~EventProfilerController(); + + static void start(CUcontext ctx, ConfigLoader& configLoader); + static void stop(CUcontext ctx); + + static void addLoggerFactory( + std::function(const Config&)> factory); + + static void addOnDemandLoggerFactory( + std::function(const Config&)> factory); + + bool canAcceptConfig() override; + + void acceptConfig(const Config& config) override; + + private: + explicit EventProfilerController( + CUcontext context, + ConfigLoader& configLoader, + detail::HeartbeatMonitor& heartbeatMonitor); + bool enableForDevice(Config& cfg); + void profilerLoop(); + + ConfigLoader& configLoader_; + std::unique_ptr newOnDemandConfig_; + detail::HeartbeatMonitor& heartbeatMonitor_; + std::unique_ptr profiler_; + std::unique_ptr profilerThread_; + std::atomic_bool stopRunloop_{false}; + std::mutex mutex_; +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/GenericTraceActivity.cpp b/tb_plugins/profiling/libkineto/src/GenericTraceActivity.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4e00b1256c4fa301e288e619ee9ef8c56c8b8569 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/GenericTraceActivity.cpp @@ -0,0 +1,10 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "GenericTraceActivity.h" +#include "output_base.h" + +namespace libkineto { + void GenericTraceActivity::log(ActivityLogger& logger) const { + logger.handleGenericActivity(*this); + } +} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/src/ILoggerObserver.cpp b/tb_plugins/profiling/libkineto/src/ILoggerObserver.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f0106578811837c9cc677def30d5697d43a94221 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/ILoggerObserver.cpp @@ -0,0 +1,54 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
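+
+// String <-> enum conversions for LoggerOutputType in the built-in logger.
+// The static_assert below checks that LoggerMap stays in enum order, which
+// lets toString() index the array directly with the enum value.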
+ +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "ILoggerObserver.h" + +#if !USE_GOOGLE_LOG + +#include +#include + +namespace libkineto { + +struct LoggerTypeName { + constexpr LoggerTypeName(const char* n, LoggerOutputType t) : name(n), type(t) {}; + const char* name; + LoggerOutputType type; +}; + +static constexpr std::array LoggerMap{{ + {"VERBOSE", LoggerOutputType::VERBOSE}, + {"INFO", LoggerOutputType::INFO}, + {"WARNING", LoggerOutputType::WARNING}, + {"ERROR", LoggerOutputType::ERROR}, + {"STAGE", LoggerOutputType::STAGE}, + {"???", LoggerOutputType::ENUM_COUNT} +}}; + +static constexpr bool matchingOrder(int idx = 0) { + return LoggerMap[idx].type == LoggerOutputType::ENUM_COUNT || + ((idx == (int) LoggerMap[idx].type) && matchingOrder(idx + 1)); +} +static_assert(matchingOrder(), "LoggerTypeName map is out of order"); + +const char* toString(LoggerOutputType t) { + if(t < VERBOSE || t >= ENUM_COUNT) { + return LoggerMap[ENUM_COUNT].name; + } + return LoggerMap[(int)t].name; +} + +LoggerOutputType toLoggerOutputType(const std::string& str) { + for (int i = 0; i < LoggerTypeCount; i++) { + if (str == LoggerMap[i].name) { + return LoggerMap[i].type; + } + } + throw std::invalid_argument(fmt::format("Invalid activity type: {}", str)); +} + +} // namespace libkineto + + +#endif // !USE_GOOGLE_LOG diff --git a/tb_plugins/profiling/libkineto/src/Logger.cpp b/tb_plugins/profiling/libkineto/src/Logger.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dbde765f51f7a5f03c31a9c79e6d00ce9a2070b6 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/Logger.cpp @@ -0,0 +1,136 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "Logger.h" +#include "ILoggerObserver.h" + +#ifndef USE_GOOGLE_LOG + +#include +#include +#include +#include +#include + +#include +#include + +#include "ThreadUtil.h" + +namespace KINETO_NAMESPACE { + +std::atomic_int Logger::severityLevel_{VERBOSE}; +std::atomic_int Logger::verboseLogLevel_{-1}; +std::atomic Logger::verboseLogModules_{~0ull}; + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wglobal-constructors" +std::mutex Logger::loggerObserversMutex_; +#pragma GCC diagnostic pop + + +Logger::Logger(int severity, int line, const char* filePath, int errnum) + : buf_(), out_(LIBKINETO_DBG_STREAM), errnum_(errnum), messageSeverity_(severity) { + buf_ << toString((LoggerOutputType) severity) << ":"; + + const auto tt = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + const char* file = strrchr(filePath, '/'); + buf_ << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(tt)) << " " + << processId() << ":" << systemThreadId() << " " + << (file ? file + 1 : filePath) << ":" << line << "] "; +} + +Logger::~Logger() { +#ifdef __linux__ + if (errnum_ != 0) { + thread_local char buf[1024]; + buf_ << " : " << strerror_r(errnum_, buf, sizeof(buf)); + } +#endif + + { + std::lock_guard guard(loggerObserversMutex_); + for (auto* observer : loggerObservers()) { + // Output to observers. Current Severity helps keep track of which bucket the output goes. + if (observer) { + observer->write(buf_.str(), (LoggerOutputType) messageSeverity_); + } + } + } + + // Finally, print to terminal or console. 
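+  // out_ was bound to LIBKINETO_DBG_STREAM (std::cerr by default) in the
+  // constructor, so the buffered message is emitted in one piece when the
+  // LOG() statement's temporary Logger is destroyed.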
+ out_ << buf_.str() << std::endl; +} + +void Logger::setVerboseLogModules(const std::vector& modules) { + uint64_t mask = 0; + if (modules.empty()) { + mask = ~0ull; + } else { + for (const std::string& name : modules) { + mask |= hash(name.c_str()); + } + } + verboseLogModules_ = mask; +} + +void Logger::addLoggerObserver(ILoggerObserver* observer) { + if (observer == nullptr) { + return; + } + std::lock_guard guard(loggerObserversMutex_); + loggerObservers().insert(observer); +} + +void Logger::removeLoggerObserver(ILoggerObserver* observer) { + std::lock_guard guard(loggerObserversMutex_); + loggerObservers().erase(observer); +} + +void Logger::addLoggerObserverDevice(int64_t device) { + std::lock_guard guard(loggerObserversMutex_); + for (auto observer : loggerObservers()) { + observer->addDevice(device); + } +} + +void Logger::addLoggerObserverEventCount(int64_t count) { + std::lock_guard guard(loggerObserversMutex_); + for (auto observer : loggerObservers()) { + observer->addEventCount(count); + } +} + +void Logger::setLoggerObserverTraceDurationMS(int64_t duration) { + std::lock_guard guard(loggerObserversMutex_); + for (auto observer : loggerObservers()) { + observer->setTraceDurationMS(duration); + } +} + +void Logger::setLoggerObserverTraceID(const std::string& tid) { + std::lock_guard guard(loggerObserversMutex_); + for (auto observer : loggerObservers()) { + observer->setTraceID(tid); + } +} + +void Logger::setLoggerObserverGroupTraceID(const std::string& gtid) { + std::lock_guard guard(loggerObserversMutex_); + for (auto observer : loggerObservers()) { + observer->setGroupTraceID(gtid); + } +} + +void Logger::addLoggerObserverDestination(const std::string& dest) { + std::lock_guard guard(loggerObserversMutex_); + for (auto observer : loggerObservers()) { + observer->addDestination(dest); + } +} + +} // namespace KINETO_NAMESPACE + +#endif // USE_GOOGLE_LOG diff --git a/tb_plugins/profiling/libkineto/src/Logger.h b/tb_plugins/profiling/libkineto/src/Logger.h new file mode 100644 index 0000000000000000000000000000000000000000..868fc84b9f4ee86d88805bed81468a5df6988257 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/Logger.h @@ -0,0 +1,244 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
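+
+// Minimal glog-style logging for libkineto: with USE_GOOGLE_LOG the macros
+// delegate to glog; otherwise the Logger class below buffers each message
+// and flushes it to registered observers and LIBKINETO_DBG_STREAM.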
+ +#pragma once + +#include + +#define LIBKINETO_DBG_STREAM std::cerr + +#if USE_GOOGLE_LOG + +#include + +#define SET_LOG_SEVERITY_LEVEL(level) +#define SET_LOG_VERBOSITY_LEVEL(level, modules) +#define LOGGER_OBSERVER_ADD_DEVICE(device) +#define LOGGER_OBSERVER_ADD_EVENT_COUNT(count) +#define LOGGER_OBSERVER_SET_TRACE_DURATION_MS(duration) +#define LOGGER_OBSERVER_SET_TRACE_ID(tid) +#define LOGGER_OBSERVER_SET_GROUP_TRACE_ID(gtid) +#define LOGGER_OBSERVER_ADD_DESTINATION(dest) +#define UST_LOGGER_MARK_COMPLETED(stage) + +#else // !USE_GOOGLE_LOG +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "ILoggerObserver.h" + +#ifdef _MSC_VER +// unset a predefined ERROR (windows) +#undef ERROR +#endif // _MSC_VER + +namespace KINETO_NAMESPACE { + +class Logger { + public: + Logger(int severity, int line, const char* filePath, int errnum = 0); + ~Logger(); + + inline std::ostream& stream() { + return buf_; + } + + static inline void setSeverityLevel(int level) { + severityLevel_ = level; + } + + static inline int severityLevel() { + return severityLevel_; + } + + static inline void setVerboseLogLevel(int level) { + verboseLogLevel_ = level; + } + + static inline int verboseLogLevel() { + return verboseLogLevel_; + } + + // This is constexpr so that the hash for a file name is computed at compile + // time when used in the VLOG macros. + // This way, there is no string comparison for matching VLOG modules, + // only a comparison of pre-computed hashes. + // No fancy hashing needed here. It's pretty inefficient (one character + // at a time) but the strings are not large and it's not in the critical path. + static constexpr uint64_t rol(uint64_t val, int amount) { + return val << amount | val >> (63 - amount); + } + static constexpr uint64_t hash(const char* s) { + uint64_t hash = hash_rec(s, 0); + return hash & rol(0x41a0240682483014ull, hash & 63); + } + static constexpr uint64_t hash_rec(const char* s, int off) { + // Random constants! + return (!s[off] ? 57ull : (hash_rec(s, off + 1) * 293) ^ s[off]); + } + static constexpr const char* basename(const char* s, int off = 0) { + return !s[off] + ? s + : s[off] == '/' ? 
basename(&s[off + 1]) : basename(s, off + 1); + } + + static void setVerboseLogModules(const std::vector& modules); + + static inline uint64_t verboseLogModules() { + return verboseLogModules_; + } + + static void clearLoggerObservers() { + std::lock_guard g(loggerObserversMutex_); + loggerObservers().clear(); + } + + static void addLoggerObserver(ILoggerObserver* observer); + + static void removeLoggerObserver(ILoggerObserver* observer); + + static void addLoggerObserverDevice(int64_t device); + + static void addLoggerObserverEventCount(int64_t count); + + static void setLoggerObserverTraceDurationMS(int64_t duration); + + static void setLoggerObserverTraceID(const std::string& tid); + + static void setLoggerObserverGroupTraceID(const std::string& gtid); + + static void addLoggerObserverDestination(const std::string& dest); + + private: + std::stringstream buf_; + std::ostream& out_; + int errnum_; + int messageSeverity_; + static std::atomic_int severityLevel_; + static std::atomic_int verboseLogLevel_; + static std::atomic verboseLogModules_; + static std::set& loggerObservers() { + static auto* inst = new std::set(); + return *inst; + } + static std::mutex loggerObserversMutex_; +}; + +class VoidLogger { + public: + VoidLogger() {} + void operator&(std::ostream&) {} +}; + +} // namespace KINETO_NAMESPACE + +#ifdef LOG // Undefine in case these are already defined (quite likely) +#undef LOG +#undef LOG_IS_ON +#undef LOG_IF +#undef LOG_EVERY_N +#undef LOG_IF_EVERY_N +#undef DLOG +#undef DLOG_IF +#undef VLOG +#undef VLOG_IF +#undef VLOG_EVERY_N +#undef VLOG_IS_ON +#undef DVLOG +#undef LOG_FIRST_N +#undef CHECK +#undef DCHECK +#undef DCHECK_EQ +#undef PLOG +#undef PCHECK +#undef LOG_OCCURRENCES +#endif + +#define LOG_IS_ON(severity) \ + (severity >= libkineto::Logger::severityLevel()) + +#define LOG_IF(severity, condition) \ + !(LOG_IS_ON(severity) && (condition)) ? (void)0 : libkineto::VoidLogger() & \ + libkineto::Logger(severity, __LINE__, __FILE__).stream() + +#define LOG(severity) LOG_IF(severity, true) + +#define LOCAL_VARNAME_CONCAT(name, suffix) _##name##suffix##_ + +#define LOCAL_VARNAME(name) LOCAL_VARNAME_CONCAT(name, __LINE__) + +#define LOG_OCCURRENCES LOCAL_VARNAME(log_count) + +#define LOG_EVERY_N(severity, rate) \ + static int LOG_OCCURRENCES = 0; \ + LOG_IF(severity, LOG_OCCURRENCES++ % rate == 0) \ + << "(x" << LOG_OCCURRENCES << ") " + +template +struct __to_constant__ { + static const uint64_t val = n; +}; +#define FILENAME_HASH \ + __to_constant__::val +#define VLOG_IS_ON(verbosity) \ + (libkineto::Logger::verboseLogLevel() >= verbosity && \ + (libkineto::Logger::verboseLogModules() & FILENAME_HASH) == FILENAME_HASH) + +#define VLOG_IF(verbosity, condition) \ + LOG_IF(VERBOSE, VLOG_IS_ON(verbosity) && (condition)) + +#define VLOG(verbosity) VLOG_IF(verbosity, true) + +#define VLOG_EVERY_N(verbosity, rate) \ + static int LOG_OCCURRENCES = 0; \ + VLOG_IF(verbosity, LOG_OCCURRENCES++ % rate == 0) \ + << "(x" << LOG_OCCURRENCES << ") " + +#define PLOG(severity) \ + libkineto::Logger(severity, __LINE__, __FILE__, errno).stream() + +#define SET_LOG_SEVERITY_LEVEL(level) \ + libkineto::Logger::setSeverityLevel(level) + +#define SET_LOG_VERBOSITY_LEVEL(level, modules) \ + libkineto::Logger::setVerboseLogLevel(level); \ + libkineto::Logger::setVerboseLogModules(modules) + +// Logging the set of devices the trace is collect on. 
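+// Example (hypothetical device id): LOGGER_OBSERVER_ADD_DEVICE(0) records
+// device 0 with every registered ILoggerObserver.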
+#define LOGGER_OBSERVER_ADD_DEVICE(device_count) \ + libkineto::Logger::addLoggerObserverDevice(device_count) + +// Incrementing the number of events collected by this trace. +#define LOGGER_OBSERVER_ADD_EVENT_COUNT(count) \ + libkineto::Logger::addLoggerObserverEventCount(count) + +// Record duration of trace in milliseconds. +#define LOGGER_OBSERVER_SET_TRACE_DURATION_MS(duration) \ + libkineto::Logger::setLoggerObserverTraceDurationMS(duration) + +// Record the trace id when given. +#define LOGGER_OBSERVER_SET_TRACE_ID(tid) \ + libkineto::Logger::setLoggerObserverTraceID(tid) + +// Record the group trace id when given. +#define LOGGER_OBSERVER_SET_GROUP_TRACE_ID(gtid) \ + libkineto::Logger::setLoggerObserverGroupTraceID(gtid) + +// Log the set of destinations the trace is sent to. +#define LOGGER_OBSERVER_ADD_DESTINATION(dest) \ + libkineto::Logger::addLoggerObserverDestination(dest) + +// UST Logger Semantics to describe when a stage is complete. +#define UST_LOGGER_MARK_COMPLETED(stage) \ + LOG(libkineto::LoggerOutputType::STAGE) << "Completed Stage: " << stage + +#endif // USE_GOOGLE_LOG diff --git a/tb_plugins/profiling/libkineto/src/LoggerCollector.h b/tb_plugins/profiling/libkineto/src/LoggerCollector.h new file mode 100644 index 0000000000000000000000000000000000000000..bb05aab218dc137cfe2f0107694a049ee2ea6508 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/LoggerCollector.h @@ -0,0 +1,70 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#if !USE_GOOGLE_LOG + +#include +#include +#include +#include + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "ILoggerObserver.h" + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +class LoggerCollector : public ILoggerObserver { + public: + LoggerCollector() : buckets_() {} + + void write(const std::string& message, LoggerOutputType ot = ERROR) override { + // Skip STAGE output type which is only used by USTLoggerCollector. + if (ot != STAGE) { + buckets_[ot].push_back(message); + } + } + + const std::map> extractCollectorMetadata() override { + return buckets_; + } + + void reset() override { + trace_duration_ms = 0; + event_count = 0; + destinations.clear(); + } + + void addDevice(const int64_t device) override { + devices.insert(device); + } + + void setTraceDurationMS(const int64_t duration) override { + trace_duration_ms = duration; + } + + void addEventCount(const int64_t count) override { + event_count += count; + } + + void addDestination(const std::string& dest) override { + destinations.insert(dest); + } + + protected: + std::map> buckets_; + + // These are useful metadata to collect from CUPTIActivityProfiler for internal tracking. + std::set devices; + int64_t trace_duration_ms{0}; + std::atomic event_count{0}; + std::set destinations; + +}; + +} // namespace KINETO_NAMESPACE + +#endif // !USE_GOOGLE_LOG diff --git a/tb_plugins/profiling/libkineto/src/RoctracerActivityApi.cpp b/tb_plugins/profiling/libkineto/src/RoctracerActivityApi.cpp new file mode 100644 index 0000000000000000000000000000000000000000..73eff13e2a08bcfecefb03f5b229bde89b7e96cb --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/RoctracerActivityApi.cpp @@ -0,0 +1,569 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
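+
+// AMD/ROCm counterpart of the CUPTI activity API: collects HIP API callbacks
+// and asynchronous GPU records via roctracer and converts both into
+// GenericTraceActivity objects for the ActivityLogger.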
+ +#include "RoctracerActivityApi.h" + +#include +#include +#include + +#include "Demangle.h" +#include "output_base.h" +#include "ThreadUtil.h" + +typedef uint64_t timestamp_t; + +static timestamp_t timespec_to_ns(const timespec& time) { + return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; + } + +using namespace std::chrono; + +namespace KINETO_NAMESPACE { + +constexpr size_t kBufSize(2 * 1024 * 1024); + +RoctracerActivityApi& RoctracerActivityApi::singleton() { + static RoctracerActivityApi instance; + return instance; +} + +RoctracerActivityApi::RoctracerActivityApi() { + gpuTraceBuffers_ = std::make_unique>(); +} + +RoctracerActivityApi::~RoctracerActivityApi() { + disableActivities(std::set()); + endTracing(); +} + +void RoctracerActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { +#ifdef HAS_ROCTRACER + if (!singleton().externalCorrelationEnabled_) { + return; + } + // placeholder +#endif +} + +void RoctracerActivityApi::popCorrelationID(CorrelationFlowType type) { +#ifdef HAS_ROCTRACER + if (!singleton().externalCorrelationEnabled_) { + return; + } + // placeholder +#endif +} + +void RoctracerActivityApi::setMaxBufferSize(int size) { + maxGpuBufferCount_ = 1 + size / kBufSize; +} + +int RoctracerActivityApi::processActivities( + ActivityLogger& logger) { + // Find offset to map from monotonic clock to system clock. + // This will break time-ordering of events but is status quo. + + timespec t0, t1, t00; + clock_gettime(CLOCK_REALTIME, &t0); + clock_gettime(CLOCK_MONOTONIC, &t1); + clock_gettime(CLOCK_REALTIME, &t00); + + const timestamp_t toffset = (timespec_to_ns(t0) >> 1) + (timespec_to_ns(t00) >> 1) - timespec_to_ns(t1); + + int count = 0; + + // Basic Api calls + + for (auto &item : rows_) { + GenericTraceActivity a; + a.startTime = (item.begin + toffset) / 1000; + a.endTime = (item.end + toffset) / 1000; + a.id = item.id; + a.device = item.pid; + a.resource = item.tid; + a.activityType = ActivityType::CUDA_RUNTIME; + a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); + a.flow.id = item.id; + a.flow.type = kLinkAsyncCpuGpu; + a.flow.start = true; + + logger.handleGenericActivity(a); + ++count; + } + + // Malloc/Free calls + for (auto &item : mallocRows_) { + GenericTraceActivity a; + a.startTime = (item.begin + toffset) / 1000; + a.endTime = (item.end + toffset) / 1000; + a.id = item.id; + a.device = item.pid; + a.resource = item.tid; + a.activityType = ActivityType::CUDA_RUNTIME; + a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); + a.flow.id = item.id; + a.flow.type = kLinkAsyncCpuGpu; + a.flow.start = true; + + a.addMetadata("ptr", item.ptr); + if (item.cid == HIP_API_ID_hipMalloc) { + a.addMetadata("size", item.size); + } + + logger.handleGenericActivity(a); + ++count; + } + + // HipMemcpy calls + for (auto &item : copyRows_) { + GenericTraceActivity a; + a.startTime = (item.begin + toffset) / 1000; + a.endTime = (item.end + toffset) / 1000; + a.id = item.id; + a.device = item.pid; + a.resource = item.tid; + a.activityType = ActivityType::CUDA_RUNTIME; + a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); + a.flow.id = item.id; + a.flow.type = kLinkAsyncCpuGpu; + a.flow.start = true; + + a.addMetadata("src", item.src); + a.addMetadata("dst", item.dst); + a.addMetadata("size", item.size); + a.addMetadata("kind", item.kind); + if ((item.cid == HIP_API_ID_hipMemcpyAsync) || (item.cid == HIP_API_ID_hipMemcpyWithStream)) { + 
a.addMetadata("stream", fmt::format("{}", reinterpret_cast(item.stream))); + } + + logger.handleGenericActivity(a); + ++count; + } + + // Kernel Launch Api calls + + for (auto &item : kernelRows_) { + GenericTraceActivity a; + a.startTime = (item.begin + toffset) / 1000; + a.endTime = (item.end + toffset) / 1000; + a.id = item.id; + a.device = item.pid; + a.resource = item.tid; + a.activityType = ActivityType::CUDA_RUNTIME; + a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); + a.flow.id = item.id; + a.flow.type = kLinkAsyncCpuGpu; + a.flow.start = true; + + if (item.functionAddr != nullptr) { + a.addMetadataQuoted( + "kernel", demangle(hipKernelNameRefByPtr(item.functionAddr, item.stream))); + } + else if (item.function != nullptr) { + a.addMetadataQuoted( + "kernel", demangle(hipKernelNameRef(item.function))); + } + a.addMetadata("grid dim", fmt::format("[{}, {}, {}]", item.gridX, item.gridY, item.gridZ)); + a.addMetadata("block dim", fmt::format("[{}, {}, {}]", item.workgroupX, item.workgroupY, item.workgroupZ)); + a.addMetadata("shared size", item.groupSegmentSize); + a.addMetadata("stream", fmt::format("{}", reinterpret_cast(item.stream))); + + // Stash launches to tie to the async ops + kernelLaunches_[a.id] = a; + + // Stash kernel names to tie to the async ops + std::string name; + if (item.functionAddr != nullptr) { + name = demangle(hipKernelNameRefByPtr(item.functionAddr, item.stream)); + } + else if (item.function != nullptr) { + name = demangle(hipKernelNameRef(item.function)); + } + if (!name.empty()) { + uint32_t string_id = reverseStrings_[name]; + if (string_id == 0) { + string_id = nextStringId_++; + reverseStrings_[name] = string_id; + strings_[string_id] = name; + } + kernelNames_[item.id] = string_id; + } + + logger.handleGenericActivity(a); + ++count; + } + + // Async Ops + + for (auto& buffer : *gpuTraceBuffers_) { + const roctracer_record_t* record = (const roctracer_record_t*)(buffer.data); + const roctracer_record_t* end_record = (const roctracer_record_t*)(buffer.data + buffer.validSize); + GenericTraceActivity a; + + while (record < end_record) { + if ((record->domain == ACTIVITY_DOMAIN_HIP_API) && (loggedIds_.contains(record->op))) { + const char *name = roctracer_op_string(record->domain, record->op, record->kind); + a.device = record->process_id; + a.resource = record->thread_id; + + a.startTime = (record->begin_ns + toffset) / 1000; + a.endTime = (record->end_ns + toffset) / 1000; + a.id = record->correlation_id; + + a.activityType = ActivityType::CUDA_RUNTIME; + a.activityName = std::string(name); + a.flow.id = record->correlation_id; + a.flow.type = kLinkAsyncCpuGpu; + a.flow.start = true; + + logger.handleGenericActivity(a); + ++count; + } + else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) { + // Overlay launch metadata for kernels + auto kit = kernelLaunches_.find(record->correlation_id); + if (kit != kernelLaunches_.end()) { + a = (*kit).second; + } + + const char *name = roctracer_op_string(record->domain, record->op, record->kind); + a.device = record->device_id; + a.resource = record->queue_id; + + a.startTime = (record->begin_ns + toffset) / 1000; + a.endTime = (record->end_ns + toffset) / 1000; + a.id = record->correlation_id; + + a.activityType = ActivityType::CONCURRENT_KERNEL; + a.activityName = std::string(name); + a.flow.id = record->correlation_id; + a.flow.type = kLinkAsyncCpuGpu; + + auto it = kernelNames_.find(record->correlation_id); + if (it != kernelNames_.end()) { + a.activityName = 
strings_[it->second]; + } + + logger.handleGenericActivity(a); + ++count; + } + + roctracer_next_record(record, &record); + } + } + return count; +} + +void RoctracerActivityApi::clearActivities() { + gpuTraceBuffers_->clear(); + rows_.clear(); + kernelRows_.clear(); + copyRows_.clear(); + mallocRows_.clear(); + kernelLaunches_.clear(); +} + +void RoctracerActivityApi::api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) +{ + RoctracerActivityApi *dis = &singleton(); + + if (domain == ACTIVITY_DOMAIN_HIP_API && dis->loggedIds_.contains(cid)) { + const hip_api_data_t* data = (const hip_api_data_t*)(callback_data); + + // Pack callbacks into row structures + + static timespec timestamp; // FIXME verify thread safety + + if (data->phase == ACTIVITY_API_PHASE_ENTER) { + clock_gettime(CLOCK_MONOTONIC, ×tamp); // record proper clock + } + else { // (data->phase == ACTIVITY_API_PHASE_EXIT) + timespec endTime; + timespec startTime { timestamp }; + clock_gettime(CLOCK_MONOTONIC, &endTime); // record proper clock + + switch (cid) { + case HIP_API_ID_hipLaunchKernel: + case HIP_API_ID_hipExtLaunchKernel: + case HIP_API_ID_hipLaunchCooperativeKernel: // Should work here + { + auto &args = data->args.hipLaunchKernel; + dis->kernelRows_.emplace_back(data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + timespec_to_ns(startTime), + timespec_to_ns(endTime), + args.function_address, + nullptr, + args.numBlocks.x, + args.numBlocks.y, + args.numBlocks.z, + args.dimBlocks.x, + args.dimBlocks.y, + args.dimBlocks.z, + args.sharedMemBytes, + args.stream + ); + } + break; + case HIP_API_ID_hipHccModuleLaunchKernel: + case HIP_API_ID_hipModuleLaunchKernel: + case HIP_API_ID_hipExtModuleLaunchKernel: + { + auto &args = data->args.hipModuleLaunchKernel; + dis->kernelRows_.emplace_back(data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + timespec_to_ns(startTime), + timespec_to_ns(endTime), + nullptr, + args.f, + args.gridDimX, + args.gridDimY, + args.gridDimZ, + args.blockDimX, + args.blockDimY, + args.blockDimZ, + args.sharedMemBytes, + args.stream + ); + } + break; + case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice: + case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: +#if 0 + { + auto &args = data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val; + dis->kernelRows_.emplace_back(data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + timespec_to_ns(startTime), + timespec_to_ns(endTime), + args.function_address, + nullptr, + args.numBlocks.x, + args.numBlocks.y, + args.numBlocks.z, + args.dimBlocks.x, + args.dimBlocks.y, + args.dimBlocks.z, + args.sharedMemBytes, + args.stream + ); + } +#endif + break; + case HIP_API_ID_hipMalloc: + dis->mallocRows_.emplace_back(data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + timespec_to_ns(startTime), + timespec_to_ns(endTime), + data->args.hipMalloc.ptr__val, + data->args.hipMalloc.size + ); + break; + case HIP_API_ID_hipFree: + dis->mallocRows_.emplace_back(data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + timespec_to_ns(startTime), + timespec_to_ns(endTime), + data->args.hipFree.ptr, + 0 + ); + break; + case HIP_API_ID_hipMemcpy: + { + auto &args = data->args.hipMemcpy; + dis->copyRows_.emplace_back(data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + timespec_to_ns(startTime), + timespec_to_ns(endTime), + args.src, + args.dst, + args.sizeBytes, + args.kind, + static_cast(0) // use 
placeholder? + ); + } + break; + case HIP_API_ID_hipMemcpyAsync: + case HIP_API_ID_hipMemcpyWithStream: + { + auto &args = data->args.hipMemcpyAsync; + dis->copyRows_.emplace_back(data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + timespec_to_ns(startTime), + timespec_to_ns(endTime), + args.src, + args.dst, + args.sizeBytes, + args.kind, + args.stream + ); + } + break; + default: + dis->rows_.emplace_back(data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + timespec_to_ns(startTime), + timespec_to_ns(endTime) + ); + break; + } + } + } +} + +void RoctracerActivityApi::activity_callback(const char* begin, const char* end, void* arg) +{ + size_t size = end - begin; + uint8_t *buffer = (uint8_t*) malloc(size); + auto &gpuTraceBuffers = singleton().gpuTraceBuffers_; + memcpy(buffer, begin, size); + gpuTraceBuffers->emplace_back(buffer, size); +} + +void RoctracerActivityApi::enableActivities( + const std::set& selected_activities) { +#ifdef HAS_ROCTRACER + if (!registered_) { + roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, nullptr); // Magic encantation + + // Set some api calls to ignore + loggedIds_.setInvertMode(true); // Omit the specified api + loggedIds_.add("hipGetDevice"); + loggedIds_.add("hipSetDevice"); + loggedIds_.add("hipGetLastError"); + loggedIds_.add("__hipPushCallConfiguration"); + loggedIds_.add("__hipPopCallConfiguration"); + loggedIds_.add("hipCtxSetCurrent"); + loggedIds_.add("hipEventRecord"); + loggedIds_.add("hipEventQuery"); + loggedIds_.add("hipGetDeviceProperties"); + loggedIds_.add("hipPeekAtLastError"); + loggedIds_.add("hipModuleGetFunction"); + loggedIds_.add("hipEventCreateWithFlags"); + + // Enable API callbacks + if (loggedIds_.invertMode() == true) { + // exclusion list - enable entire domain and turn off things in list + roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, api_callback, nullptr); + const std::unordered_map &filter = loggedIds_.filterList(); + for (auto it = filter.begin(); it != filter.end(); ++it) { + roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first); + } + } + else { + // inclusion list - only enable things in the list + const std::unordered_map &filter = loggedIds_.filterList(); + roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); + for (auto it = filter.begin(); it != filter.end(); ++it) { + roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first, api_callback, nullptr); + } + } + //roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, api_callback, nullptr); + + // Allocate default tracing pool + roctracer_properties_t properties; + memset(&properties, 0, sizeof(roctracer_properties_t)); + properties.buffer_size = 0x1000; + roctracer_open_pool(&properties); + + // Enable async op collection + roctracer_properties_t hcc_cb_properties; + memset(&hcc_cb_properties, 0, sizeof(roctracer_properties_t)); + hcc_cb_properties.buffer_size = 0x4000; + hcc_cb_properties.buffer_callback_fun = activity_callback; + roctracer_open_pool_expl(&hcc_cb_properties, &hccPool_); + roctracer_enable_domain_activity_expl(ACTIVITY_DOMAIN_HCC_OPS, hccPool_); + + registered_ = true; + } + + for (const auto& activity : selected_activities) { + if (activity == ActivityType::EXTERNAL_CORRELATION) { + externalCorrelationEnabled_ = true; + } + } + + roctracer_start(); +#endif +} + +void RoctracerActivityApi::disableActivities( + const std::set& selected_activities) { +#ifdef HAS_ROCTRACER + roctracer_stop(); + roctracer_flush_activity_expl(hccPool_); + + for (const auto& activity 
: selected_activities) { + if (activity == ActivityType::EXTERNAL_CORRELATION) { + externalCorrelationEnabled_ = false; + } + } +#endif +} + +void RoctracerActivityApi::endTracing() { + if (registered_ == true) { + roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); + //roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX); + + roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS); + roctracer_close_pool_expl(hccPool_); + } +} + + +ApiIdList::ApiIdList() +: invert_(true) +{ +} + +void ApiIdList::add(std::string apiName) +{ + uint32_t cid = 0; + if (roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == ROCTRACER_STATUS_SUCCESS) { + filter_[cid] = 1; + } +} +void ApiIdList::remove(std::string apiName) +{ + uint32_t cid = 0; + if (roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == ROCTRACER_STATUS_SUCCESS) { + filter_.erase(cid); + } +} + +bool ApiIdList::loadUserPrefs() +{ + // placeholder + return false; +} +bool ApiIdList::contains(uint32_t apiId) +{ + return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/RoctracerActivityApi.h b/tb_plugins/profiling/libkineto/src/RoctracerActivityApi.h new file mode 100644 index 0000000000000000000000000000000000000000..28280253e7c8426e85c11d679785bcd74fa2a0c7 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/RoctracerActivityApi.h @@ -0,0 +1,171 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAS_ROCTRACER +#include +#include +#include +#include +#include +#endif + +#include "ActivityType.h" +#include "GenericTraceActivity.h" +#include "RoctracerActivityBuffer.h" + + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +class ApiIdList +{ +public: + ApiIdList(); + bool invertMode() { return invert_; } + void setInvertMode(bool invert) { invert_ = invert; } + void add(std::string apiName); + void remove(std::string apiName); + bool loadUserPrefs(); + bool contains(uint32_t apiId); + const std::unordered_map &filterList() { return filter_; } + +private: + std::unordered_map filter_; + bool invert_; +}; + +struct roctracerRow { + roctracerRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid + , uint32_t tid, uint64_t begin, uint64_t end) + : id(id), domain(domain), cid(cid), pid(pid), tid(tid), begin(begin), end(end) {} + uint64_t id; // correlation_id + uint32_t domain; + uint32_t cid; + uint32_t pid; + uint32_t tid; + uint64_t begin; + uint64_t end; +}; + +struct kernelRow : public roctracerRow { + kernelRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid + , uint32_t tid, uint64_t begin, uint64_t end + , const void *faddr, hipFunction_t function + , unsigned int gx, unsigned int gy, unsigned int gz + , unsigned int wx, unsigned int wy, unsigned int wz + , size_t gss, hipStream_t stream) + : roctracerRow(id, domain, cid, pid, tid, begin, end), functionAddr(faddr) + , function(function), gridX(gx), gridY(gy), gridZ(gz) + , workgroupX(wx), workgroupY(wy), workgroupZ(wz), groupSegmentSize(gss) + , stream(stream) {} + const void* functionAddr; + hipFunction_t function; + unsigned int gridX; + unsigned int gridY; + unsigned int gridZ; + unsigned int workgroupX; + unsigned int workgroupY; + unsigned int workgroupZ; + size_t groupSegmentSize; + hipStream_t stream; +}; + +struct copyRow : public roctracerRow { + 
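+  // One hipMemcpy-family call: source/destination pointers, byte count,
+  // copy kind, and (for the async variants) the stream it was issued on.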
copyRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid + , uint32_t tid, uint64_t begin, uint64_t end + , const void* src, const void *dst, size_t size, hipMemcpyKind kind + , hipStream_t stream) + : roctracerRow(id, domain, cid, pid, tid, begin, end) + , src(src), dst(dst), size(size), kind(kind), stream(stream) {} + const void *src; + const void *dst; + size_t size; + hipMemcpyKind kind; + hipStream_t stream; +}; + +struct mallocRow : public roctracerRow { + mallocRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid + , uint32_t tid, uint64_t begin, uint64_t end + , const void* ptr, size_t size) + : roctracerRow(id, domain, cid, pid, tid, begin, end) + , ptr(ptr), size(size) {} + const void *ptr; + size_t size; +}; + + +class RoctracerActivityApi { + public: + enum CorrelationFlowType { + Default, + User + }; + + RoctracerActivityApi(); + RoctracerActivityApi(const RoctracerActivityApi&) = delete; + RoctracerActivityApi& operator=(const RoctracerActivityApi&) = delete; + + virtual ~RoctracerActivityApi(); + + static RoctracerActivityApi& singleton(); + + static void pushCorrelationID(int id, CorrelationFlowType type); + static void popCorrelationID(CorrelationFlowType type); + + void enableActivities( + const std::set& selected_activities); + void disableActivities( + const std::set& selected_activities); + void clearActivities(); + + int processActivities(ActivityLogger& logger); + + void setMaxBufferSize(int size); + + std::atomic_bool stopCollection{false}; + + private: + bool registered_{false}; + void endTracing(); + +#ifdef HAS_ROCTRACER + roctracer_pool_t *hccPool_{NULL}; + static void api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg); + static void activity_callback(const char* begin, const char* end, void* arg); + + //Name cache + uint32_t nextStringId_{2}; + std::map strings_; + std::map reverseStrings_; + std::map kernelNames_; + + ApiIdList loggedIds_; + + // Api callback data + std::deque rows_; + std::deque kernelRows_; + std::deque copyRows_; + std::deque mallocRows_; + std::map kernelLaunches_; +#endif + + int maxGpuBufferCount_{0}; + std::unique_ptr> gpuTraceBuffers_; + bool externalCorrelationEnabled_{true}; +}; + +} // namespace KINETO_NAMESPACE + diff --git a/tb_plugins/profiling/libkineto/src/RoctracerActivityBuffer.h b/tb_plugins/profiling/libkineto/src/RoctracerActivityBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..cd8a5709a841b7c988ab3f2d1f3108d693343584 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/RoctracerActivityBuffer.h @@ -0,0 +1,30 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include +#include +#include +#include + +namespace KINETO_NAMESPACE { + +class RoctracerActivityBuffer { + public: + // data must be allocated using malloc. + // Ownership is transferred to this object. + RoctracerActivityBuffer(uint8_t* data, size_t validSize) + : data(data), validSize(validSize) {} + + ~RoctracerActivityBuffer() { + free(data); + } + + // Allocated by malloc + uint8_t* data{nullptr}; + + // Number of bytes used + size_t validSize; +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/SampleListener.h b/tb_plugins/profiling/libkineto/src/SampleListener.h new file mode 100644 index 0000000000000000000000000000000000000000..bff86ad122a051d4f3dfdbdd329a3b63d93a7c77 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/SampleListener.h @@ -0,0 +1,146 @@ +// (c) Meta Platforms, Inc. and affiliates. 
Confidential and proprietary.
+
+#pragma once
+
+#include <assert.h>
+#include <cmath>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace KINETO_NAMESPACE {
+
+class Config;
+
+class SampleValue {
+ public:
+  template <typename T>
+  explicit SampleValue(T v) {
+    init(v);
+  }
+
+  SampleValue(const SampleValue&) = default;
+  SampleValue& operator=(const SampleValue&) = delete;
+  SampleValue(SampleValue&&) = default;
+  SampleValue& operator=(SampleValue&&) = default;
+
+  bool isInt() const {
+    return type_ == INT64;
+  }
+
+  int64_t getInt() const {
+    assert(isInt());
+    return int_;
+  }
+
+  bool isDouble() const {
+    return type_ == DOUBLE;
+  }
+
+  double getDouble() const {
+    assert(isDouble());
+    return dbl_;
+  }
+
+  inline void operator*=(double x) {
+    assert(isDouble() || isInt());
+    if (isDouble()) {
+      dbl_ *= x;
+    } else {
+      int_ = std::round(int_ * x);
+    }
+  }
+
+  inline bool operator<(const SampleValue& o) const {
+    if (type_ != o.type_) {
+      return type_ < o.type_;
+    } else if (type_ == INT64) {
+      return int_ < o.int_;
+    } else if (type_ == DOUBLE) {
+      return dbl_ < o.dbl_;
+    }
+    assert(false);
+    return true;
+  }
+
+  void print(std::ostream& s) const {
+    if (type_ == INT64) {
+      s << int_;
+    } else if (type_ == DOUBLE) {
+      s << dbl_;
+    } else {
+      assert(false);
+    }
+  }
+
+ private:
+  enum Type { INT64, DOUBLE };
+
+  template <typename T>
+  void init(T v);
+
+  Type type_{INT64};
+  union {
+    int64_t int_{0};
+    double dbl_;
+  };
+};
+
+template <>
+inline void SampleValue::init(uint64_t v) {
+  int_ = v;
+  type_ = INT64;
+}
+template <>
+inline void SampleValue::init(int64_t v) {
+  int_ = v;
+  type_ = INT64;
+}
+template <>
+inline void SampleValue::init(int v) {
+  int_ = v;
+  type_ = INT64;
+}
+template <>
+inline void SampleValue::init(double v) {
+  dbl_ = v;
+  type_ = DOUBLE;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const SampleValue& s) {
+  s.print(out);
+  return out;
+}
+
+using PercentileList = std::vector<std::pair<int, SampleValue>>;
+
+struct Stat {
+  const std::string& name;
+  const PercentileList percentileValues;
+  SampleValue total;
+};
+
+struct Sample {
+  Sample(int stats_count) {
+    stats.reserve(stats_count);
+  }
+
+  // Offset in milliseconds from first sample in report
+  int deltaMsec;
+  std::vector<Stat> stats;
+};
+
+// Inherit from this to be notified of samples
+class SampleListener {
+ public:
+  SampleListener(const SampleListener&) = delete;
+  SampleListener& operator=(const SampleListener&) = delete;
+
+  virtual ~SampleListener() = default;
+
+  // Report bucketed & aggregated values for event
+  virtual void handleSample(int device, const Sample& sample, bool from_new_version) = 0;
+
+  virtual void update(const Config& config) = 0;
+
+ protected:
+  SampleListener() = default;
+};
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/ScopeExit.h b/tb_plugins/profiling/libkineto/src/ScopeExit.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9a6bc83ef942c7fb0e4b198b0396e5d75aa5a3a
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/ScopeExit.h
@@ -0,0 +1,29 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+// Implement a simple scope handler allowing a function to release
+// resources when an error or exception occurs
+
+template <typename T>
+class ScopeExit {
+ public:
+  explicit ScopeExit(T t) : t(t) {}
+  ~ScopeExit() {
+    t();
+  }
+  T t;
+};
+
+template <typename T>
+ScopeExit<T> makeScopeExit(T t) {
+  return ScopeExit<T>(t);
+}
+
+// Add a level of indirection so __LINE__ is expanded
+#define __kINETO_CONCAT(name, line) name##line
+#define ANON_VAR(name, line) __kINETO_CONCAT(name, line)
+
+#define SCOPE_EXIT(func) \
+  const auto ANON_VAR(SCOPE_BLOCK, __LINE__) = \
+      makeScopeExit([=]() { func; })
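+
+// Usage sketch (illustrative only; openLog/closeLog are hypothetical helpers):
+//
+//   void writeTrace() {
+//     auto* f = openLog("trace.json");
+//     SCOPE_EXIT(closeLog(f));  // runs when writeTrace() returns or unwinds
+//     ...
+//   }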
diff --git a/tb_plugins/profiling/libkineto/src/ThreadUtil.cpp b/tb_plugins/profiling/libkineto/src/ThreadUtil.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f67d54d58512aa47b05aed69748a6894aa06b1c
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/ThreadUtil.cpp
@@ -0,0 +1,203 @@
+#include "ThreadUtil.h"
+
+#ifndef _MSC_VER
+#include <pthread.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+#else // _MSC_VER
+#include <codecvt>
+#include <locale>
+#define WIN32_LEAN_AND_MEAN
+#define NOGDI
+#include <windows.h>
+#include <processthreadsapi.h>
+#undef ERROR
+#endif // _MSC_VER
+
+#ifdef __ANDROID__
+#include <sys/prctl.h>
+#endif
+
+#include <fmt/format.h>
+#include <iostream>
+#include <string>
+
+namespace libkineto {
+
+namespace {
+thread_local int32_t _pid = 0;
+thread_local int32_t _tid = 0;
+thread_local int32_t _sysTid = 0;
+}
+
+int32_t processId() {
+  if (!_pid) {
+#ifndef _MSC_VER
+    _pid = (int32_t)getpid();
+#else
+    _pid = (int32_t)GetCurrentProcessId();
+#endif
+  }
+  return _pid;
+}
+
+int32_t systemThreadId() {
+  if (!_sysTid) {
+#ifdef __APPLE__
+    _sysTid = (int32_t)syscall(SYS_thread_selfid);
+#elif defined _MSC_VER
+    _sysTid = (int32_t)GetCurrentThreadId();
+#else
+    _sysTid = (int32_t)syscall(SYS_gettid);
+#endif
+  }
+  return _sysTid;
+}
+
+int32_t threadId() {
+  if (!_tid) {
+#ifdef __APPLE__
+    uint64_t tid;
+    pthread_threadid_np(nullptr, &tid);
+    _tid = tid;
+#elif defined _MSC_VER
+    _tid = (int32_t)GetCurrentThreadId();
+#else
+    pthread_t pth = pthread_self();
+    int32_t* ptr = reinterpret_cast<int32_t*>(&pth);
+    _tid = *ptr;
+#endif
+  }
+  return _tid;
+}
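+
+// Note: the IDs above are cached in thread_local variables because the
+// underlying lookups (getpid, the gettid syscall, GetCurrentThreadId) are
+// comparatively expensive on hot profiling paths.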
+
+namespace {
+static constexpr size_t kMaxThreadNameLength = 16;
+
+static constexpr const char* basename(const char* s, int off = 0) {
+  return !s[off]
+      ? s
+      : s[off] == '/' ? basename(&s[off + 1]) : basename(s, off + 1);
+}
+#if defined(_MSC_VER)
+void *getKernel32Func(const char* procName) {
+  return GetProcAddress(GetModuleHandleA("KERNEL32.DLL"), procName);
+}
+#endif
+}
+
+bool setThreadName(const std::string& name) {
+#ifdef __APPLE__
+  return 0 == pthread_setname_np(name.c_str());
+#elif defined _MSC_VER
+  // Per https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreaddescription
+  // Use runtime linking to set thread description
+  static auto _SetThreadDescription =
+      reinterpret_cast<decltype(&SetThreadDescription)>(
+          getKernel32Func("SetThreadDescription"));
+  if (!_SetThreadDescription) {
+    return false;
+  }
+  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
+  std::wstring wname = conv.from_bytes(name);
+  HRESULT hr = _SetThreadDescription(GetCurrentThread(), wname.c_str());
+  return SUCCEEDED(hr);
+#else
+  return 0 == pthread_setname_np(pthread_self(), name.c_str());
+#endif
+}
+
+std::string getThreadName() {
+#ifndef _MSC_VER
+  char buf[kMaxThreadNameLength] = "";
+  if (
+#ifndef __ANDROID__
+      pthread_getname_np(pthread_self(), buf, kMaxThreadNameLength) != 0
+#else
+      prctl(PR_GET_NAME, buf, kMaxThreadNameLength) != 0
+#endif
+  ) {
+    return "Unknown";
+  }
+  return buf;
+#else // _MSC_VER
+  static auto _GetThreadDescription =
+      reinterpret_cast<decltype(&GetThreadDescription)>(
+          getKernel32Func("GetThreadDescription"));
+  if (!_GetThreadDescription) {
+    return "Unknown";
+  }
+  PWSTR data;
+  HRESULT hr = _GetThreadDescription(GetCurrentThread(), &data);
+  if (!SUCCEEDED(hr)) {
+    return "";
+  }
+  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
+  std::string name = conv.to_bytes(data);
+  LocalFree(data);
+  return name;
+#endif
}
+
+// Linux:
+// Extract process name from /proc/pid/cmdline. This does not have
+// the 16 character limit that /proc/pid/status and /proc/pid/comm have.
+std::string processName(int32_t pid) {
+#ifdef __linux__
+  FILE* cmdfile = fopen(fmt::format("/proc/{}/cmdline", pid).c_str(), "r");
+  if (cmdfile != nullptr) {
+    char* command = nullptr;
+    int scanned = fscanf(cmdfile, "%ms", &command);
+    fclose(cmdfile);
+    if (scanned > 0 && command) {
+      std::string ret(basename(command));
+      free(command);
+      return ret;
+    }
+  }
+  std::cerr << "Failed to read process name for pid " << pid << std::endl;
+#endif
+  return "";
+}
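+
+// Example /proc/<pid>/stat prefix (illustrative): "1234 (python) S 1000 ..."
+// The format string below skips the pid, captures the command between
+// parentheses, skips the single state character, then reads the parent pid.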
+
+// Max number of parent pids to collect, just for extra safeguarding.
+constexpr int kMaxParentPids = 10;
+
+// Return a pair of <parent_pid, command_of_pid>
+static std::pair<int32_t, std::string> parentPidAndCommand(int32_t pid) {
+#ifdef __linux__
+  FILE* statfile = fopen(fmt::format("/proc/{}/stat", pid).c_str(), "r");
+  if (statfile == nullptr) {
+    return std::make_pair(0, "");
+  }
+  int32_t parent_pid;
+  char* command = nullptr;
+  int scanned = fscanf(statfile, "%*d (%m[^)]) %*c %d", &command, &parent_pid);
+  fclose(statfile);
+  std::pair<int32_t, std::string> ret;
+  if (scanned == 2) {
+    ret = std::make_pair(parent_pid, std::string(command));
+  } else {
+    std::cerr << "Failed to parse /proc/" << pid << "/stat" << std::endl;
+    ret = std::make_pair(0, "");
+  }
+
+  // The 'm' character in the format tells fscanf to allocate memory
+  // for the parsed string, which we need to free here.
+  free(command);
+  return ret;
+#else
+  return std::make_pair(0, "");
+#endif
+}
+
+std::vector<std::pair<int32_t, std::string>> pidCommandPairsOfAncestors() {
+  std::vector<std::pair<int32_t, std::string>> pairs;
+  pairs.reserve(kMaxParentPids + 1);
+  int32_t curr_pid = processId();
+  for (int i = 0; i <= kMaxParentPids && curr_pid > 1; i++) {
+    std::pair<int32_t, std::string> ppid_and_comm = parentPidAndCommand(curr_pid);
+    pairs.push_back(std::make_pair(curr_pid, ppid_and_comm.second));
+    curr_pid = ppid_and_comm.first;
+  }
+  return pairs;
+}
+
+} // namespace libkineto
diff --git a/tb_plugins/profiling/libkineto/src/WeakSymbols.cpp b/tb_plugins/profiling/libkineto/src/WeakSymbols.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..540a5ac8f97c8f38c7ee3d31ea285a3ab7c9f375
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/WeakSymbols.cpp
@@ -0,0 +1,12 @@
+#include <stdexcept>
+
+#ifndef _MSC_VER
+extern "C" {
+// This function is needed to avoid a superfluous dependency on the GNU OpenMP
+// library when CUPTI is linked statically.
+// For more details see https://github.com/pytorch/pytorch/issues/51026
+__attribute__((weak)) int acc_get_device_type() {
+  throw std::runtime_error("Dummy implementation of acc_get_device_type is not supposed to be called!");
+}
+
+} // extern "C"
+#endif
diff --git a/tb_plugins/profiling/libkineto/src/cupti_call.h b/tb_plugins/profiling/libkineto/src/cupti_call.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd6ebae7691ed607867db5717248ba22f4efa5c0
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/cupti_call.h
@@ -0,0 +1,33 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <fmt/format.h>
+
+#ifdef HAS_CUPTI
+
+#include <cupti.h>
+
+#define CUPTI_CALL(call)                           \
+  [&]() -> CUptiResult {                           \
+    CUptiResult _status_ = call;                   \
+    if (_status_ != CUPTI_SUCCESS) {               \
+      const char* _errstr_ = nullptr;              \
+      cuptiGetResultString(_status_, &_errstr_);   \
+      LOG(WARNING) << fmt::format(                 \
+          "function {} failed with error {} ({})", \
+          #call,                                   \
+          _errstr_,                                \
+          (int)_status_);                          \
+    }                                              \
+    return _status_;                               \
+  }()
+
+#define CUPTI_CALL_NOWARN(call) call
+
+#else
+
+#define CUPTI_CALL(call) call
+#define CUPTI_CALL_NOWARN(call) call
+
+#endif // HAS_CUPTI
diff --git a/tb_plugins/profiling/libkineto/src/cupti_strings.cpp b/tb_plugins/profiling/libkineto/src/cupti_strings.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4535273a277e04b0b6f98b539df82955ef62468f
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/cupti_strings.cpp
@@ -0,0 +1,502 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
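+
+// String tables mapping CUPTI enum values and runtime callback IDs to
+// human-readable names for trace output.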
+ +#include "cupti_strings.h" + +namespace libkineto { + +const char* memcpyKindString( + CUpti_ActivityMemcpyKind kind) { + switch (kind) { + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: + return "HtoD"; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH: + return "DtoH"; + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA: + return "HtoA"; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH: + return "AtoH"; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA: + return "AtoA"; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD: + return "AtoD"; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA: + return "DtoA"; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD: + return "DtoD"; + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH: + return "HtoH"; + case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP: + return "PtoP"; + default: + break; + } + return ""; +} + +const char* memoryKindString( + CUpti_ActivityMemoryKind kind) { + switch (kind) { + case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN: + return "Unknown"; + case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE: + return "Pageable"; + case CUPTI_ACTIVITY_MEMORY_KIND_PINNED: + return "Pinned"; + case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE: + return "Device"; + case CUPTI_ACTIVITY_MEMORY_KIND_ARRAY: + return "Array"; + case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED: + return "Managed"; + case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC: + return "Device Static"; + case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC: + return "Managed Static"; + case CUPTI_ACTIVITY_MEMORY_KIND_FORCE_INT: + return "Force Int"; + default: + return "Unrecognized"; + } +} + +const char* overheadKindString( + CUpti_ActivityOverheadKind kind) { + switch (kind) { + case CUPTI_ACTIVITY_OVERHEAD_UNKNOWN: + return "Unknown"; + case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER: + return "Driver Compiler"; + case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH: + return "Buffer Flush"; + case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION: + return "Instrumentation"; + case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE: + return "Resource"; + case CUPTI_ACTIVITY_OVERHEAD_FORCE_INT: + return "Force Int"; + default: + return "Unrecognized"; + } +} + + + +static const char* runtimeCbidNames[] = { + "INVALID", + "cudaDriverGetVersion", + "cudaRuntimeGetVersion", + "cudaGetDeviceCount", + "cudaGetDeviceProperties", + "cudaChooseDevice", + "cudaGetChannelDesc", + "cudaCreateChannelDesc", + "cudaConfigureCall", + "cudaSetupArgument", + "cudaGetLastError", + "cudaPeekAtLastError", + "cudaGetErrorString", + "cudaLaunch", + "cudaFuncSetCacheConfig", + "cudaFuncGetAttributes", + "cudaSetDevice", + "cudaGetDevice", + "cudaSetValidDevices", + "cudaSetDeviceFlags", + "cudaMalloc", + "cudaMallocPitch", + "cudaFree", + "cudaMallocArray", + "cudaFreeArray", + "cudaMallocHost", + "cudaFreeHost", + "cudaHostAlloc", + "cudaHostGetDevicePointer", + "cudaHostGetFlags", + "cudaMemGetInfo", + "cudaMemcpy", + "cudaMemcpy2D", + "cudaMemcpyToArray", + "cudaMemcpy2DToArray", + "cudaMemcpyFromArray", + "cudaMemcpy2DFromArray", + "cudaMemcpyArrayToArray", + "cudaMemcpy2DArrayToArray", + "cudaMemcpyToSymbol", + "cudaMemcpyFromSymbol", + "cudaMemcpyAsync", + "cudaMemcpyToArrayAsync", + "cudaMemcpyFromArrayAsync", + "cudaMemcpy2DAsync", + "cudaMemcpy2DToArrayAsync", + "cudaMemcpy2DFromArrayAsync", + "cudaMemcpyToSymbolAsync", + "cudaMemcpyFromSymbolAsync", + "cudaMemset", + "cudaMemset2D", + "cudaMemsetAsync", + "cudaMemset2DAsync", + "cudaGetSymbolAddress", + "cudaGetSymbolSize", + "cudaBindTexture", + "cudaBindTexture2D", + "cudaBindTextureToArray", + "cudaUnbindTexture", + "cudaGetTextureAlignmentOffset", + "cudaGetTextureReference", + "cudaBindSurfaceToArray", + 
"cudaGetSurfaceReference", + "cudaGLSetGLDevice", + "cudaGLRegisterBufferObject", + "cudaGLMapBufferObject", + "cudaGLUnmapBufferObject", + "cudaGLUnregisterBufferObject", + "cudaGLSetBufferObjectMapFlags", + "cudaGLMapBufferObjectAsync", + "cudaGLUnmapBufferObjectAsync", + "cudaWGLGetDevice", + "cudaGraphicsGLRegisterImage", + "cudaGraphicsGLRegisterBuffer", + "cudaGraphicsUnregisterResource", + "cudaGraphicsResourceSetMapFlags", + "cudaGraphicsMapResources", + "cudaGraphicsUnmapResources", + "cudaGraphicsResourceGetMappedPointer", + "cudaGraphicsSubResourceGetMappedArray", + "cudaVDPAUGetDevice", + "cudaVDPAUSetVDPAUDevice", + "cudaGraphicsVDPAURegisterVideoSurface", + "cudaGraphicsVDPAURegisterOutputSurface", + "cudaD3D11GetDevice", + "cudaD3D11GetDevices", + "cudaD3D11SetDirect3DDevice", + "cudaGraphicsD3D11RegisterResource", + "cudaD3D10GetDevice", + "cudaD3D10GetDevices", + "cudaD3D10SetDirect3DDevice", + "cudaGraphicsD3D10RegisterResource", + "cudaD3D10RegisterResource", + "cudaD3D10UnregisterResource", + "cudaD3D10MapResources", + "cudaD3D10UnmapResources", + "cudaD3D10ResourceSetMapFlags", + "cudaD3D10ResourceGetSurfaceDimensions", + "cudaD3D10ResourceGetMappedArray", + "cudaD3D10ResourceGetMappedPointer", + "cudaD3D10ResourceGetMappedSize", + "cudaD3D10ResourceGetMappedPitch", + "cudaD3D9GetDevice", + "cudaD3D9GetDevices", + "cudaD3D9SetDirect3DDevice", + "cudaD3D9GetDirect3DDevice", + "cudaGraphicsD3D9RegisterResource", + "cudaD3D9RegisterResource", + "cudaD3D9UnregisterResource", + "cudaD3D9MapResources", + "cudaD3D9UnmapResources", + "cudaD3D9ResourceSetMapFlags", + "cudaD3D9ResourceGetSurfaceDimensions", + "cudaD3D9ResourceGetMappedArray", + "cudaD3D9ResourceGetMappedPointer", + "cudaD3D9ResourceGetMappedSize", + "cudaD3D9ResourceGetMappedPitch", + "cudaD3D9Begin", + "cudaD3D9End", + "cudaD3D9RegisterVertexBuffer", + "cudaD3D9UnregisterVertexBuffer", + "cudaD3D9MapVertexBuffer", + "cudaD3D9UnmapVertexBuffer", + "cudaThreadExit", + "cudaSetDoubleForDevice", + "cudaSetDoubleForHost", + "cudaThreadSynchronize", + "cudaThreadGetLimit", + "cudaThreadSetLimit", + "cudaStreamCreate", + "cudaStreamDestroy", + "cudaStreamSynchronize", + "cudaStreamQuery", + "cudaEventCreate", + "cudaEventCreateWithFlags", + "cudaEventRecord", + "cudaEventDestroy", + "cudaEventSynchronize", + "cudaEventQuery", + "cudaEventElapsedTime", + "cudaMalloc3D", + "cudaMalloc3DArray", + "cudaMemset3D", + "cudaMemset3DAsync", + "cudaMemcpy3D", + "cudaMemcpy3DAsync", + "cudaThreadSetCacheConfig", + "cudaStreamWaitEvent", + "cudaD3D11GetDirect3DDevice", + "cudaD3D10GetDirect3DDevice", + "cudaThreadGetCacheConfig", + "cudaPointerGetAttributes", + "cudaHostRegister", + "cudaHostUnregister", + "cudaDeviceCanAccessPeer", + "cudaDeviceEnablePeerAccess", + "cudaDeviceDisablePeerAccess", + "cudaPeerRegister", + "cudaPeerUnregister", + "cudaPeerGetDevicePointer", + "cudaMemcpyPeer", + "cudaMemcpyPeerAsync", + "cudaMemcpy3DPeer", + "cudaMemcpy3DPeerAsync", + "cudaDeviceReset", + "cudaDeviceSynchronize", + "cudaDeviceGetLimit", + "cudaDeviceSetLimit", + "cudaDeviceGetCacheConfig", + "cudaDeviceSetCacheConfig", + "cudaProfilerInitialize", + "cudaProfilerStart", + "cudaProfilerStop", + "cudaDeviceGetByPCIBusId", + "cudaDeviceGetPCIBusId", + "cudaGLGetDevices", + "cudaIpcGetEventHandle", + "cudaIpcOpenEventHandle", + "cudaIpcGetMemHandle", + "cudaIpcOpenMemHandle", + "cudaIpcCloseMemHandle", + "cudaArrayGetInfo", + "cudaFuncSetSharedMemConfig", + "cudaDeviceGetSharedMemConfig", + "cudaDeviceSetSharedMemConfig", + 
"cudaCreateTextureObject", + "cudaDestroyTextureObject", + "cudaGetTextureObjectResourceDesc", + "cudaGetTextureObjectTextureDesc", + "cudaCreateSurfaceObject", + "cudaDestroySurfaceObject", + "cudaGetSurfaceObjectResourceDesc", + "cudaMallocMipmappedArray", + "cudaGetMipmappedArrayLevel", + "cudaFreeMipmappedArray", + "cudaBindTextureToMipmappedArray", + "cudaGraphicsResourceGetMappedMipmappedArray", + "cudaStreamAddCallback", + "cudaStreamCreateWithFlags", + "cudaGetTextureObjectResourceViewDesc", + "cudaDeviceGetAttribute", + "cudaStreamDestroy", + "cudaStreamCreateWithPriority", + "cudaStreamGetPriority", + "cudaStreamGetFlags", + "cudaDeviceGetStreamPriorityRange", + "cudaMallocManaged", + "cudaOccupancyMaxActiveBlocksPerMultiprocessor", + "cudaStreamAttachMemAsync", + "cudaGetErrorName", + "cudaOccupancyMaxActiveBlocksPerMultiprocessor", + "cudaLaunchKernel", + "cudaGetDeviceFlags", + "cudaLaunch_ptsz", + "cudaLaunchKernel_ptsz", + "cudaMemcpy_ptds", + "cudaMemcpy2D_ptds", + "cudaMemcpyToArray_ptds", + "cudaMemcpy2DToArray_ptds", + "cudaMemcpyFromArray_ptds", + "cudaMemcpy2DFromArray_ptds", + "cudaMemcpyArrayToArray_ptds", + "cudaMemcpy2DArrayToArray_ptds", + "cudaMemcpyToSymbol_ptds", + "cudaMemcpyFromSymbol_ptds", + "cudaMemcpyAsync_ptsz", + "cudaMemcpyToArrayAsync_ptsz", + "cudaMemcpyFromArrayAsync_ptsz", + "cudaMemcpy2DAsync_ptsz", + "cudaMemcpy2DToArrayAsync_ptsz", + "cudaMemcpy2DFromArrayAsync_ptsz", + "cudaMemcpyToSymbolAsync_ptsz", + "cudaMemcpyFromSymbolAsync_ptsz", + "cudaMemset_ptds", + "cudaMemset2D_ptds", + "cudaMemsetAsync_ptsz", + "cudaMemset2DAsync_ptsz", + "cudaStreamGetPriority_ptsz", + "cudaStreamGetFlags_ptsz", + "cudaStreamSynchronize_ptsz", + "cudaStreamQuery_ptsz", + "cudaStreamAttachMemAsync_ptsz", + "cudaEventRecord_ptsz", + "cudaMemset3D_ptds", + "cudaMemset3DAsync_ptsz", + "cudaMemcpy3D_ptds", + "cudaMemcpy3DAsync_ptsz", + "cudaStreamWaitEvent_ptsz", + "cudaStreamAddCallback_ptsz", + "cudaMemcpy3DPeer_ptds", + "cudaMemcpy3DPeerAsync_ptsz", + "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + "cudaMemPrefetchAsync", + "cudaMemPrefetchAsync_ptsz", + "cudaMemAdvise", + "cudaDeviceGetP2PAttribute", + "cudaGraphicsEGLRegisterImage", + "cudaEGLStreamConsumerConnect", + "cudaEGLStreamConsumerDisconnect", + "cudaEGLStreamConsumerAcquireFrame", + "cudaEGLStreamConsumerReleaseFrame", + "cudaEGLStreamProducerConnect", + "cudaEGLStreamProducerDisconnect", + "cudaEGLStreamProducerPresentFrame", + "cudaEGLStreamProducerReturnFrame", + "cudaGraphicsResourceGetMappedEglFrame", + "cudaMemRangeGetAttribute", + "cudaMemRangeGetAttributes", + "cudaEGLStreamConsumerConnectWithFlags", + "cudaLaunchCooperativeKernel", + "cudaLaunchCooperativeKernel_ptsz", + "cudaEventCreateFromEGLSync", + "cudaLaunchCooperativeKernelMultiDevice", + "cudaFuncSetAttribute", + "cudaImportExternalMemory", + "cudaExternalMemoryGetMappedBuffer", + "cudaExternalMemoryGetMappedMipmappedArray", + "cudaDestroyExternalMemory", + "cudaImportExternalSemaphore", + "cudaSignalExternalSemaphoresAsync", + "cudaSignalExternalSemaphoresAsync_ptsz", + "cudaWaitExternalSemaphoresAsync", + "cudaWaitExternalSemaphoresAsync_ptsz", + "cudaDestroyExternalSemaphore", + "cudaLaunchHostFunc", + "cudaLaunchHostFunc_ptsz", + "cudaGraphCreate", + "cudaGraphKernelNodeGetParams", + "cudaGraphKernelNodeSetParams", + "cudaGraphAddKernelNode", + "cudaGraphAddMemcpyNode", + "cudaGraphMemcpyNodeGetParams", + "cudaGraphMemcpyNodeSetParams", + "cudaGraphAddMemsetNode", + "cudaGraphMemsetNodeGetParams", + 
"cudaGraphMemsetNodeSetParams", + "cudaGraphAddHostNode", + "cudaGraphHostNodeGetParams", + "cudaGraphAddChildGraphNode", + "cudaGraphChildGraphNodeGetGraph", + "cudaGraphAddEmptyNode", + "cudaGraphClone", + "cudaGraphNodeFindInClone", + "cudaGraphNodeGetType", + "cudaGraphGetRootNodes", + "cudaGraphNodeGetDependencies", + "cudaGraphNodeGetDependentNodes", + "cudaGraphAddDependencies", + "cudaGraphRemoveDependencies", + "cudaGraphDestroyNode", + "cudaGraphInstantiate", + "cudaGraphLaunch", + "cudaGraphLaunch_ptsz", + "cudaGraphExecDestroy", + "cudaGraphDestroy", + "cudaStreamBeginCapture", + "cudaStreamBeginCapture_ptsz", + "cudaStreamIsCapturing", + "cudaStreamIsCapturing_ptsz", + "cudaStreamEndCapture", + "cudaStreamEndCapture_ptsz", + "cudaGraphHostNodeSetParams", + "cudaGraphGetNodes", + "cudaGraphGetEdges", + "cudaStreamGetCaptureInfo", + "cudaStreamGetCaptureInfo_ptsz", + "cudaGraphExecKernelNodeSetParams", + "cudaThreadExchangeStreamCaptureMode", + "cudaDeviceGetNvSciSyncAttributes", + "cudaOccupancyAvailableDynamicSMemPerBlock", + "cudaStreamSetFlags", + "cudaStreamSetFlags_ptsz", + "cudaGraphExecMemcpyNodeSetParams", + "cudaGraphExecMemsetNodeSetParams", + "cudaGraphExecHostNodeSetParams", + "cudaGraphExecUpdate", + "cudaGetFuncBySymbol", + "cudaCtxResetPersistingL2Cache", + "cudaGraphKernelNodeCopyAttributes", + "cudaGraphKernelNodeGetAttribute", + "cudaGraphKernelNodeSetAttribute", + "cudaStreamCopyAttributes", + "cudaStreamCopyAttributes_ptsz", + "cudaStreamGetAttribute", + "cudaStreamGetAttribute_ptsz", + "cudaStreamSetAttribute", + "cudaStreamSetAttribute_ptsz", + "cudaDeviceGetTexture1DLinearMaxWidth", + "cudaGraphUpload", + "cudaGraphUpload_ptsz", + "cudaGraphAddMemcpyNodeToSymbol", + "cudaGraphAddMemcpyNodeFromSymbol", + "cudaGraphAddMemcpyNode1D", + "cudaGraphMemcpyNodeSetParamsToSymbol", + "cudaGraphMemcpyNodeSetParamsFromSymbol", + "cudaGraphMemcpyNodeSetParams1D", + "cudaGraphExecMemcpyNodeSetParamsToSymbol", + "cudaGraphExecMemcpyNodeSetParamsFromSymbol", + "cudaGraphExecMemcpyNodeSetParams1D", + "cudaArrayGetSparseProperties", + "cudaMipmappedArrayGetSparseProperties", + "cudaGraphExecChildGraphNodeSetParams", + "cudaGraphAddEventRecordNode", + "cudaGraphEventRecordNodeGetEvent", + "cudaGraphEventRecordNodeSetEvent", + "cudaGraphAddEventWaitNode", + "cudaGraphEventWaitNodeGetEvent", + "cudaGraphEventWaitNodeSetEvent", + "cudaGraphExecEventRecordNodeSetEvent", + "cudaGraphExecEventWaitNodeSetEvent", + "cudaEventRecordWithFlags", + "cudaEventRecordWithFlags_ptsz", + "cudaDeviceGetDefaultMemPool", + "cudaMallocAsync", + "cudaMallocAsync_ptsz", + "cudaFreeAsync", + "cudaFreeAsync_ptsz", + "cudaMemPoolTrimTo", + "cudaMemPoolSetAttribute", + "cudaMemPoolGetAttribute", + "cudaMemPoolSetAccess", + "cudaArrayGetPlane", + "cudaMemPoolGetAccess", + "cudaMemPoolCreate", + "cudaMemPoolDestroy", + "cudaDeviceSetMemPool", + "cudaDeviceGetMemPool", + "cudaMemPoolExportToShareableHandle", + "cudaMemPoolImportFromShareableHandle", + "cudaMemPoolExportPointer", + "cudaMemPoolImportPointer", + "cudaMallocFromPoolAsync", + "cudaMallocFromPoolAsync_ptsz", + "cudaSignalExternalSemaphoresAsync", + "cudaSignalExternalSemaphoresAsync", + "cudaWaitExternalSemaphoresAsync", + "cudaWaitExternalSemaphoresAsync", + "cudaGraphAddExternalSemaphoresSignalNode", + "cudaGraphExternalSemaphoresSignalNodeGetParams", + "cudaGraphExternalSemaphoresSignalNodeSetParams", + "cudaGraphAddExternalSemaphoresWaitNode", + "cudaGraphExternalSemaphoresWaitNodeGetParams", + 
"cudaGraphExternalSemaphoresWaitNodeSetParams", + "cudaGraphExecExternalSemaphoresSignalNodeSetParams", + "cudaGraphExecExternalSemaphoresWaitNodeSetParams", + "SIZE" +}; + +const char* runtimeCbidName(CUpti_CallbackId cbid) { + constexpr int names_size = + sizeof(runtimeCbidNames) / sizeof(runtimeCbidNames[0]); + if (cbid < 0 || cbid >= names_size) { + return runtimeCbidNames[CUPTI_RUNTIME_TRACE_CBID_INVALID]; + } + return runtimeCbidNames[cbid]; +} + +} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/src/cupti_strings.h b/tb_plugins/profiling/libkineto/src/cupti_strings.h new file mode 100644 index 0000000000000000000000000000000000000000..bbfebb983648005d8268d9a29d613d369d6a5384 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/cupti_strings.h @@ -0,0 +1,14 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include + +namespace libkineto { + +const char* memoryKindString(CUpti_ActivityMemoryKind kind); +const char* memcpyKindString(CUpti_ActivityMemcpyKind kind); +const char* runtimeCbidName(CUpti_CallbackId cbid); +const char* overheadKindString(CUpti_ActivityOverheadKind kind); + +} // namespace libkineto diff --git a/tb_plugins/profiling/libkineto/src/init.cpp b/tb_plugins/profiling/libkineto/src/init.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4e1022485ac5d17b5af1e0676b6a4595a138e1b5 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/init.cpp @@ -0,0 +1,139 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include +#include + +#include "ActivityProfilerProxy.h" +#include "Config.h" +#ifdef HAS_CUPTI +#include "CuptiCallbackApi.h" +#include "CuptiActivityApi.h" +#include "EventProfilerController.h" +#endif +#include "cupti_call.h" +#include "libkineto.h" + +#include "Logger.h" + +namespace KINETO_NAMESPACE { + +#ifdef HAS_CUPTI +static bool initialized = false; +static std::mutex initMutex; + +static void initProfilers( + CUpti_CallbackDomain /*domain*/, + CUpti_CallbackId /*cbid*/, + const CUpti_CallbackData* cbInfo) { + CUpti_ResourceData* d = (CUpti_ResourceData*)cbInfo; + CUcontext ctx = d->context; + + VLOG(0) << "CUDA Context created"; + std::lock_guard lock(initMutex); + + if (!initialized) { + libkineto::api().initProfilerIfRegistered(); + initialized = true; + VLOG(0) << "libkineto profilers activated"; + } + if (getenv("KINETO_DISABLE_EVENT_PROFILER") != nullptr) { + VLOG(0) << "Event profiler disabled via env var"; + } else { + ConfigLoader& config_loader = libkineto::api().configLoader(); + config_loader.initBaseConfig(); + EventProfilerController::start(ctx, config_loader); + } +} + +// Some models suffer from excessive instrumentation code gen +// on dynamic attach which can hang for more than 5+ seconds. +// If the workload was meant to be traced, preload the CUPTI +// to take the performance hit early on. 
+// https://docs.nvidia.com/cupti/r_main.html#r_overhead
+static bool shouldPreloadCuptiInstrumentation() {
+  return getenv("PRELOAD_CUPTI_INSTRUMENTATION") != nullptr;
+}
+
+static void stopProfiler(
+    CUpti_CallbackDomain /*domain*/,
+    CUpti_CallbackId /*cbid*/,
+    const CUpti_CallbackData* cbInfo) {
+  CUpti_ResourceData* d = (CUpti_ResourceData*)cbInfo;
+  CUcontext ctx = d->context;
+
+  LOG(INFO) << "CUDA Context destroyed";
+  std::lock_guard<std::mutex> lock(initMutex);
+  EventProfilerController::stop(ctx);
+}
+#endif // HAS_CUPTI
+
+} // namespace KINETO_NAMESPACE
+
+// Callback interface with CUPTI and library constructors
+using namespace KINETO_NAMESPACE;
+extern "C" {
+
+// Return true if no CUPTI errors occurred during init
+bool libkineto_init(bool cpuOnly, bool logOnError) {
+  bool success = true;
+#ifdef HAS_CUPTI
+  if (!cpuOnly) {
+    // libcupti will be lazily loaded on this call.
+    // If it is not available (e.g. CUDA is not installed),
+    // then this call will return an error and we just abort init.
+    auto& cbapi = CuptiCallbackApi::singleton();
+    bool status = false;
+
+    if (cbapi.initSuccess()) {
+      const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RESOURCE;
+      status = cbapi.registerCallback(
+          domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED, initProfilers);
+      status = status && cbapi.registerCallback(
+          domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, stopProfiler);
+
+      if (status) {
+        status = cbapi.enableCallback(
+            domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED);
+        status = status && cbapi.enableCallback(
+            domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED);
+      }
+    }
+
+    if (!cbapi.initSuccess() || !status) {
+      success = false;
+      cpuOnly = true;
+      if (logOnError) {
+        CUPTI_CALL(cbapi.getCuptiStatus());
+        LOG(WARNING) << "CUPTI initialization failed - "
+                     << "CUDA profiler activities will be missing";
+        LOG(INFO) << "If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to "
+                  << "https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti";
+      }
+    }
+  }
+
+  if (shouldPreloadCuptiInstrumentation()) {
+    CuptiActivityApi::forceLoadCupti();
+  }
+#endif // HAS_CUPTI
+
+  ConfigLoader& config_loader = libkineto::api().configLoader();
+  libkineto::api().registerProfiler(
+      std::make_unique<ActivityProfilerProxy>(cpuOnly, config_loader));
+
+  return success;
+}
+
+// The CUDA driver calls this function if the CUDA_INJECTION64_PATH environment
+// variable is set
+int InitializeInjection(void) {
+  LOG(INFO) << "Injection mode: Initializing libkineto";
+  libkineto_init(false /*cpuOnly*/, true /*logOnError*/);
+  return 1;
+}
+
+void suppressLibkinetoLogMessages() {
+  SET_LOG_SEVERITY_LEVEL(ERROR);
+}
+
+} // extern "C"
diff --git a/tb_plugins/profiling/libkineto/src/libkineto_api.cpp b/tb_plugins/profiling/libkineto/src/libkineto_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9a622e4f5e5cfd54848cb8c6dc05b98da2fb6011
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/libkineto_api.cpp
@@ -0,0 +1,41 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
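+
+// Definition of the LibkinetoApi singleton accessor and client registration;
+// the corresponding declarations live in libkineto.h.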
+
+#include "libkineto.h"
+
+#include "ConfigLoader.h"
+#include "ThreadUtil.h"
+
+namespace libkineto {
+
+LibkinetoApi& api() {
+  static LibkinetoApi instance(ConfigLoader::instance());
+  return instance;
+}
+
+void LibkinetoApi::initClientIfRegistered() {
+  if (client_) {
+    if (clientRegisterThread_ != threadId()) {
+      fprintf(
+          stderr,
+          "ERROR: External init callback must run in same thread as registerClient "
+          "(%d != %d)\n",
+          threadId(),
+          (int)clientRegisterThread_);
+    } else {
+      client_->init();
+    }
+  }
+}
+
+void LibkinetoApi::registerClient(ClientInterface* client) {
+  client_ = client;
+  if (client && activityProfiler_) {
+    // Can initialize straight away
+    client->init();
+  }
+  // Assume here that the external init callback is *not* threadsafe
+  // and only call it if it's the same thread that called registerClient
+  clientRegisterThread_ = threadId();
+}
+
+} // namespace libkineto
diff --git a/tb_plugins/profiling/libkineto/src/output_base.h b/tb_plugins/profiling/libkineto/src/output_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..29d0d57768c91b8593f202cea51071a1affcd88d
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/output_base.h
@@ -0,0 +1,104 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#ifdef HAS_CUPTI
+#include <cupti.h>
+#include "CuptiActivity.h"
+#endif // HAS_CUPTI
+#include "ActivityBuffers.h"
+#include "GenericTraceActivity.h"
+#include "ThreadUtil.h"
+#include "TraceSpan.h"
+
+namespace KINETO_NAMESPACE {
+  class Config;
+  class GpuKernelActivity;
+  struct RuntimeActivity;
+}
+
+namespace libkineto {
+
+using namespace KINETO_NAMESPACE;
+
+class ActivityLogger {
+ public:
+
+  virtual ~ActivityLogger() = default;
+
+  struct DeviceInfo {
+    DeviceInfo(int64_t id, const std::string& name, const std::string& label) :
+        id(id), name(name), label(label) {}
+    int64_t id;
+    const std::string name;
+    const std::string label;
+  };
+
+  struct ResourceInfo {
+    ResourceInfo(
+        int64_t deviceId,
+        int64_t id,
+        int64_t sortIndex,
+        const std::string& name) :
+        id(id), sortIndex(sortIndex), deviceId(deviceId), name(name) {}
+    int64_t id;
+    int64_t sortIndex;
+    int64_t deviceId;
+    const std::string name;
+  };
+
+  struct OverheadInfo {
+    explicit OverheadInfo(const std::string& name) : name(name) {}
+    const std::string name;
+  };
+
+  virtual void handleDeviceInfo(
+      const DeviceInfo& info,
+      uint64_t time) = 0;
+
+  virtual void handleResourceInfo(const ResourceInfo& info, int64_t time) = 0;
+
+  virtual void handleOverheadInfo(const OverheadInfo& info, int64_t time) = 0;
+
+  virtual void handleTraceSpan(const TraceSpan& span) = 0;
+
+  virtual void handleActivity(
+      const libkineto::ITraceActivity& activity) = 0;
+  virtual void handleGenericActivity(
+      const libkineto::GenericTraceActivity& activity) = 0;
+
+#ifdef HAS_CUPTI
+  virtual void handleGpuActivity(
+      const GpuActivity<CUpti_ActivityKernel4>& activity) = 0;
+  virtual void handleGpuActivity(
+      const GpuActivity<CUpti_ActivityMemcpy>& activity) = 0;
+  virtual void handleGpuActivity(
+      const GpuActivity<CUpti_ActivityMemcpy2>& activity) = 0;
+  virtual void handleGpuActivity(
+      const GpuActivity<CUpti_ActivityMemset>& activity) = 0;
+#endif // HAS_CUPTI
+
+  virtual void handleTraceStart(
+      const std::unordered_map<std::string, std::string>& metadata) = 0;
+
+  void handleTraceStart() {
+    handleTraceStart(std::unordered_map<std::string, std::string>());
+  }
+
+  virtual void finalizeTrace(
+      const KINETO_NAMESPACE::Config& config,
+      std::unique_ptr<ActivityBuffers> buffers,
+      int64_t endTime,
+      std::unordered_map<std::string, std::vector<std::string>>& metadata) = 0;
+
+ protected:
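+  // Only concrete logger implementations may construct the interface.
+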
ActivityLogger() = default; +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/output_csv.cpp b/tb_plugins/profiling/libkineto/src/output_csv.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e56c02293982745ed0c013b83bd04d9f42ea7305 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/output_csv.cpp @@ -0,0 +1,88 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "output_csv.h" + +#include +#include +#include + +#include +#include + +#include "Config.h" +#include "Logger.h" + +namespace KINETO_NAMESPACE { + +static void write_header( + std::ostream& out, + const std::vector& percentiles) { + out << "timestamp,delta_ms,device,event_name"; + for (int p : percentiles) { + out << ",p" << p; + } + out << ",total" << std::endl; +} + +void EventCSVLogger::update(const Config& config) { + eventNames_.clear(); + eventNames_.insert(config.eventNames().begin(), config.eventNames().end()); + eventNames_.insert(config.metricNames().begin(), config.metricNames().end()); + if (config.percentiles() != percentiles_) { + percentiles_ = config.percentiles(); + if (out_) { + write_header(*out_, percentiles_); + } + } +} + +void EventCSVLogger::handleSample(int device, const Sample& sample, bool from_new_version) { + using namespace std::chrono; + if (out_) { + auto now = system_clock::now(); + auto time = system_clock::to_time_t(now); + for (const Stat& s : sample.stats) { + if (eventNames_.find(s.name) == eventNames_.end()) { + continue; + } + *out_ << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(time)) << ","; + *out_ << sample.deltaMsec << ","; + *out_ << device << ","; + *out_ << s.name; + for (const auto& p : s.percentileValues) { + *out_ << "," << p.second; + } + *out_ << "," << s.total << std::endl; + } + } +} + +void EventCSVFileLogger::update(const Config& config) { + if (config.eventLogFile() != filename_) { + if (of_.is_open()) { + of_.close(); + out_ = nullptr; + percentiles_.clear(); + } + filename_ = config.eventLogFile(); + if (!filename_.empty()) { + of_.open(filename_, std::ios::out | std::ios::trunc); + out_ = &of_; + } + } + EventCSVLogger::update(config); +} + +void EventCSVDbgLogger::update(const Config& config) { + if (out_ && config.verboseLogLevel() < 0) { + out_ = nullptr; + } else if (!out_ && config.verboseLogLevel() >= 0) { + out_ = &LIBKINETO_DBG_STREAM; + } + if (config.verboseLogLevel() >= 0) { + percentiles_.clear(); + EventCSVLogger::update(config); + } +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/output_csv.h b/tb_plugins/profiling/libkineto/src/output_csv.h new file mode 100644 index 0000000000000000000000000000000000000000..bca29f4db99af8aedf031aed869ff2efd3df6155 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/output_csv.h @@ -0,0 +1,39 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
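+
+// Example output produced by EventCSVLogger (illustrative values; the header
+// comes from write_header in output_csv.cpp):
+//
+//   timestamp,delta_ms,device,event_name,p50,p95,total
+//   2022-03-01 12:00:00,100,0,elapsed_cycles_sm,1000,1500,123456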
+
+#pragma once
+#include "SampleListener.h"
+
+#include <fstream>
+#include <ostream>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace KINETO_NAMESPACE {
+
+class EventCSVLogger : public SampleListener {
+ public:
+  void update(const Config& config) override;
+  void handleSample(int device, const Sample& sample, bool from_new_version) override;
+
+ protected:
+  EventCSVLogger() : out_(nullptr) {}
+
+  std::ostream* out_;
+  std::set<std::string> eventNames_;
+  std::vector<int> percentiles_;
+};
+
+class EventCSVFileLogger : public EventCSVLogger {
+ public:
+  void update(const Config& config) override;
+
+ private:
+  std::ofstream of_;
+  std::string filename_;
+};
+
+class EventCSVDbgLogger : public EventCSVLogger {
+ public:
+  void update(const Config& config) override;
+};
+
+} // namespace KINETO_NAMESPACE
diff --git a/tb_plugins/profiling/libkineto/src/output_json.cpp b/tb_plugins/profiling/libkineto/src/output_json.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ef22339fad15d6a78e43d7fcb7761fbbc97333b
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/src/output_json.cpp
@@ -0,0 +1,583 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include "output_json.h"
+
+#include <algorithm>
+#include <fstream>
+#include <string>
+
+#include <fmt/format.h>
+
+#include "Config.h"
+#ifdef HAS_CUPTI
+#include "CuptiActivity.h"
+#include "CuptiActivity.tpp"
+#include "CuptiActivityApi.h"
+#include "CudaDeviceProperties.h"
+#endif // HAS_CUPTI
+#include "Demangle.h"
+#include "TraceSpan.h"
+
+#include "Logger.h"
+
+using std::endl;
+using namespace libkineto;
+
+namespace KINETO_NAMESPACE {
+
+static constexpr int kSchemaVersion = 1;
+static constexpr char kFlowStart = 's';
+static constexpr char kFlowEnd = 'f';
+
+#ifdef __linux__
+static constexpr char kDefaultLogFileFmt[] =
+    "/tmp/libkineto_activities_{}.json";
+#else
+static constexpr char kDefaultLogFileFmt[] = "libkineto_activities_{}.json";
+#endif
+
+std::string& ChromeTraceLogger::sanitizeStrForJSON(std::string& value) {
+// Replace all backslashes with forward slashes because backslashes in
+// Windows paths cause JSONDecodeError on the reader side.
+#ifdef _WIN32
+  std::replace(value.begin(), value.end(), '\\', '/');
+#endif
+  return value;
+}
+
+void ChromeTraceLogger::metadataToJSON(
+    const std::unordered_map<std::string, std::string>& metadata) {
+  for (const auto& kv : metadata) {
+    traceOf_ << fmt::format(R"JSON(
+  "{}": {},)JSON", kv.first, kv.second);
+  }
+}
+
+void ChromeTraceLogger::handleTraceStart(
+    const std::unordered_map<std::string, std::string>& metadata) {
+  traceOf_ << fmt::format(R"JSON(
+{{
+  "schemaVersion": {},)JSON", kSchemaVersion);
+
+#ifdef HAS_CUPTI
+  traceOf_ << fmt::format(R"JSON(
+  "deviceProperties": [{}
+  ],)JSON", devicePropertiesJson());
+#endif
+
+  metadataToJSON(metadata);
+  traceOf_ << R"JSON(
+  "traceEvents": [)JSON";
+}
+
+static std::string defaultFileName() {
+  return fmt::format(kDefaultLogFileFmt, processId());
+}
+
+void ChromeTraceLogger::openTraceFile() {
+  traceOf_.open(fileName_, std::ofstream::out | std::ofstream::trunc);
+  if (!traceOf_) {
+    PLOG(ERROR) << "Failed to open '" << fileName_ << "'";
+  } else {
+    LOG(INFO) << "Tracing to " << fileName_;
+  }
+}
+
+ChromeTraceLogger::ChromeTraceLogger(const std::string& traceFileName) {
+  fileName_ = traceFileName.empty() ? defaultFileName() : traceFileName;
+  traceOf_.clear(std::ios_base::badbit);
+  openTraceFile();
+}
+
+static int64_t us(int64_t timestamp) {
+  // It's important that this conversion is the same here and in the CPU trace.
+  // No rounding!
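+  // e.g. us(1234999) == 1234, not 1235: integer division truncates rather
+  // than rounds, so both sides must convert identically.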
+  return timestamp / 1000;
+}
+
+void ChromeTraceLogger::handleDeviceInfo(
+    const DeviceInfo& info,
+    uint64_t time) {
+  if (!traceOf_) {
+    return;
+  }
+
+  // M is for metadata
+  // process_name needs a pid and a name arg
+  // clang-format off
+  traceOf_ << fmt::format(R"JSON(
+  {{
+    "name": "process_name", "ph": "M", "ts": {}, "pid": {}, "tid": 0,
+    "args": {{
+      "name": "{}"
+    }}
+  }},
+  {{
+    "name": "process_labels", "ph": "M", "ts": {}, "pid": {}, "tid": 0,
+    "args": {{
+      "labels": "{}"
+    }}
+  }},
+  {{
+    "name": "process_sort_index", "ph": "M", "ts": {}, "pid": {}, "tid": 0,
+    "args": {{
+      "sort_index": {}
+    }}
+  }},)JSON",
+      time, info.id,
+      info.name,
+      time, info.id,
+      info.label,
+      time, info.id,
+      info.id < 8 ? info.id + 0x1000000ll : info.id);
+  // clang-format on
+}
+
+void ChromeTraceLogger::handleResourceInfo(
+    const ResourceInfo& info,
+    int64_t time) {
+  if (!traceOf_) {
+    return;
+  }
+
+  // M is for metadata
+  // thread_name needs a pid and a name arg
+  // clang-format off
+  traceOf_ << fmt::format(R"JSON(
+  {{
+    "name": "thread_name", "ph": "M", "ts": {}, "pid": {}, "tid": {},
+    "args": {{
+      "name": "{}"
+    }}
+  }},
+  {{
+    "name": "thread_sort_index", "ph": "M", "ts": {}, "pid": {}, "tid": {},
+    "args": {{
+      "sort_index": {}
+    }}
+  }},)JSON",
+      time, info.deviceId, info.id,
+      info.name,
+      time, info.deviceId, info.id,
+      info.sortIndex);
+  // clang-format on
+}
+
+void ChromeTraceLogger::handleOverheadInfo(
+    const OverheadInfo& info,
+    int64_t time) {
+  if (!traceOf_) {
+    return;
+  }
+
+  // TODO: reserve pid = -1 for overhead but we need to rethink how to scale this for
+  // other metadata
+  // clang-format off
+  traceOf_ << fmt::format(R"JSON(
+  {{
+    "name": "process_name", "ph": "M", "ts": {}, "pid": -1, "tid": 0,
+    "args": {{
+      "name": "{}"
+    }}
+  }},
+  {{
+    "name": "process_sort_index", "ph": "M", "ts": {}, "pid": -1, "tid": 0,
+    "args": {{
+      "sort_index": {}
+    }}
+  }},)JSON",
+      time,
+      info.name,
+      time,
+      0x100000All);
+  // clang-format on
+}
+
+void ChromeTraceLogger::handleTraceSpan(const TraceSpan& span) {
+  if (!traceOf_) {
+    return;
+  }
+
+  // clang-format off
+  traceOf_ << fmt::format(R"JSON(
+  {{
+    "ph": "X", "cat": "Trace", "ts": {}, "dur": {},
+    "pid": "Spans", "tid": "{}",
+    "name": "{}{} ({})",
+    "args": {{
+      "Op count": {}
+    }}
+  }},
+  {{
+    "name": "process_sort_index", "ph": "M", "ts": {},
+    "pid": "Spans", "tid": 0,
+    "args": {{
+      "sort_index": {}
+    }}
+  }},)JSON",
+      span.startTime, span.endTime - span.startTime,
+      span.name,
+      span.prefix, span.name, span.iteration,
+      span.opCount,
+      span.startTime,
+      // Large sort index to appear at the bottom
+      0x20000000ll);
+  // clang-format on
+
+  addIterationMarker(span);
+}
+
+void ChromeTraceLogger::addIterationMarker(const TraceSpan& span) {
+  if (!traceOf_) {
+    return;
+  }
+
+  // clang-format off
+  traceOf_ << fmt::format(R"JSON(
+  {{
+    "name": "Iteration Start: {}", "ph": "i", "s": "g",
+    "pid": "Traces", "tid": "Trace {}", "ts": {}
+  }},)JSON",
+      span.name,
+      span.name, span.startTime);
+  // clang-format on
+}
+
+static std::string traceActivityJson(const ITraceActivity& activity) {
+  // clang-format off
+  int64_t ts = activity.timestamp();
+  int64_t duration = activity.duration();
+  if (activity.type() == ActivityType::GPU_USER_ANNOTATION) {
+    // The GPU user annotations start at the same time as the
+    // first associated GPU activity. Since they appear later
+    // in the trace file, this causes a visualization issue in Chrome.
+    // Make it start one us earlier.
+    ts--;
+    duration++; // Still need it to end at the original point
+  }
+  return fmt::format(R"JSON(
+    "name": "{}", "pid": {}, "tid": {},
+    "ts": {}, "dur": {})JSON",
+      activity.name(), activity.deviceId(), activity.resourceId(),
+      ts, duration);
+  // clang-format on
+}
+
+void ChromeTraceLogger::handleGenericInstantEvent(
+    const libkineto::ITraceActivity& op) {
+  if (!traceOf_) {
+    return;
+  }
+
+  traceOf_ << fmt::format(R"JSON(
+  {{
+    "ph": "i", "s": "t", "name": "{}",
+    "pid": {}, "tid": {},
+    "ts": {},
+    "args": {{
+      {}
+    }}
+  }},)JSON",
+      op.name(), op.deviceId(), op.resourceId(),
+      op.timestamp(), op.metadataJson());
+}
+
+void ChromeTraceLogger::handleActivity(
+    const libkineto::ITraceActivity& op) {
+  if (!traceOf_) {
+    return;
+  }
+
+  if (op.type() == ActivityType::CPU_INSTANT_EVENT) {
+    handleGenericInstantEvent(op);
+    return;
+  }
+
+  const std::string op_metadata = op.metadataJson();
+  std::string separator = "";
+  if (op_metadata.find_first_not_of(" \t\n") != std::string::npos) {
+    separator = ",\n  ";
+  }
+  std::string span = "";
+  if (op.traceSpan()) {
+    span = fmt::format(R"JSON(
+      "Trace name": "{}", "Trace iteration": {},)JSON",
+        op.traceSpan()->name,
+        op.traceSpan()->iteration);
+  }
+
+  // clang-format off
+  traceOf_ << fmt::format(R"JSON(
+  {{
+    "ph": "X", "cat": "{}", {},
+    "args": {{{}
+      "External id": {}{}{}
+    }}
+  }},)JSON",
+      toString(op.type()), traceActivityJson(op),
+      // args
+      span,
+      op.correlationId(), separator, op_metadata);
+  // clang-format on
+  if (op.flowId() > 0) {
+    handleGenericLink(op);
+  }
+}
+
+void ChromeTraceLogger::handleGenericActivity(
+    const libkineto::GenericTraceActivity& op) {
+  handleActivity(op);
+}
+
+void ChromeTraceLogger::handleGenericLink(const ITraceActivity& act) {
+  static struct {
+    int type;
+    char longName[24];
+    char shortName[16];
+  } flow_names[] = {
+      {kLinkFwdBwd, "forward_backward", "fwd_bwd"},
+      {kLinkAsyncCpuGpu, "async_cpu_to_gpu", "async_gpu"}
+  };
+  for (auto& flow : flow_names) {
+    if (act.flowType() == flow.type) {
+      // Link the activities via flow ID in source and destination.
+      // The source node must return true from flowStart()
+      // and the destination node false.
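+      // Chrome trace pairs the two ends by matching "id": the source emits a
+      // 's' (kFlowStart) event and the destination an 'f' (kFlowEnd) event.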
+ if (act.flowStart()) { + handleLink(kFlowStart, act, act.flowId(), flow.longName, flow.shortName); + } else { + handleLink(kFlowEnd, act, act.flowId(), flow.longName, flow.shortName); + } + return; + } + } + LOG(ERROR) << "Unknown flow type: " << act.flowType(); +} + +void ChromeTraceLogger::handleLink( + char type, + const ITraceActivity& e, + int64_t id, + const std::string& cat, + const std::string& name) { + if (!traceOf_) { + return; + } + + // clang-format off + traceOf_ << fmt::format(R"JSON( + {{ + "ph": "{}", "id": {}, "pid": {}, "tid": {}, "ts": {}, + "cat": "{}", "name": "{}", "bp": "e" + }},)JSON", + type, id, e.deviceId(), e.resourceId(), e.timestamp(), cat, name); + // clang-format on +} + +#ifdef HAS_CUPTI +// GPU side kernel activity +void ChromeTraceLogger::handleGpuActivity( + const GpuActivity& activity) { + if (!traceOf_) { + return; + } + const CUpti_ActivityKernel4* kernel = &activity.raw(); + constexpr int threads_per_warp = 32; + float blocks_per_sm = -1.0; + float warps_per_sm = -1.0; + int sm_count = smCount(kernel->deviceId); + if (sm_count) { + blocks_per_sm = + (kernel->gridX * kernel->gridY * kernel->gridZ) / (float) sm_count; + warps_per_sm = + blocks_per_sm * (kernel->blockX * kernel->blockY * kernel->blockZ) + / threads_per_warp; + } + + // Calculate occupancy + float occupancy = KINETO_NAMESPACE::kernelOccupancy( + kernel->deviceId, + kernel->registersPerThread, + kernel->staticSharedMemory, + kernel->dynamicSharedMemory, + kernel->blockX, + kernel->blockY, + kernel->blockZ, + blocks_per_sm); + + // clang-format off + traceOf_ << fmt::format(R"JSON( + {{ + "ph": "X", "cat": "Kernel", {}, + "args": {{ + "queued": {}, "device": {}, "context": {}, + "stream": {}, "correlation": {}, + "registers per thread": {}, + "shared memory": {}, + "blocks per SM": {}, + "warps per SM": {}, + "grid": [{}, {}, {}], + "block": [{}, {}, {}], + "est. achieved occupancy %": {} + }} + }},)JSON", + traceActivityJson(activity), + // args + us(kernel->queued), kernel->deviceId, kernel->contextId, + kernel->streamId, kernel->correlationId, + kernel->registersPerThread, + kernel->staticSharedMemory + kernel->dynamicSharedMemory, + blocks_per_sm, + warps_per_sm, + kernel->gridX, kernel->gridY, kernel->gridZ, + kernel->blockX, kernel->blockY, kernel->blockZ, + (int) (0.5 + occupancy * 100.0)); + // clang-format on + + auto to_id = activity.correlationId(); + handleLink(kFlowEnd, activity, to_id, "async_cpu_to_gpu", "async_gpu"); +} + +static std::string bandwidth(uint64_t bytes, uint64_t duration) { + return duration == 0 ? 
"\"N/A\"" : fmt::format("{}", bytes * 1.0 / duration); +} + +// GPU side memcpy activity +void ChromeTraceLogger::handleGpuActivity( + const GpuActivity& activity) { + if (!traceOf_) { + return; + } + const CUpti_ActivityMemcpy& memcpy = activity.raw(); + VLOG(2) << memcpy.correlationId << ": MEMCPY"; + // clang-format off + traceOf_ << fmt::format(R"JSON( + {{ + "ph": "X", "cat": "Memcpy", {}, + "args": {{ + "device": {}, "context": {}, + "stream": {}, "correlation": {}, + "bytes": {}, "memory bandwidth (GB/s)": {} + }} + }},)JSON", + traceActivityJson(activity), + // args + memcpy.deviceId, memcpy.contextId, + memcpy.streamId, memcpy.correlationId, + memcpy.bytes, bandwidth(memcpy.bytes, memcpy.end - memcpy.start)); + // clang-format on + + int64_t to_id = activity.correlationId(); + handleLink(kFlowEnd, activity, to_id, "async_cpu_to_gpu", "async_gpu"); +} + +// GPU side memcpy activity +void ChromeTraceLogger::handleGpuActivity( + const GpuActivity& activity) { + if (!traceOf_) { + return; + } + const CUpti_ActivityMemcpy2& memcpy = activity.raw(); + // clang-format off + traceOf_ << fmt::format(R"JSON( + {{ + "ph": "X", "cat": "Memcpy", {}, + "args": {{ + "fromDevice": {}, "inDevice": {}, "toDevice": {}, + "fromContext": {}, "inContext": {}, "toContext": {}, + "stream": {}, "correlation": {}, + "bytes": {}, "memory bandwidth (GB/s)": {} + }} + }},)JSON", + traceActivityJson(activity), + // args + memcpy.srcDeviceId, memcpy.deviceId, memcpy.dstDeviceId, + memcpy.srcContextId, memcpy.contextId, memcpy.dstContextId, + memcpy.streamId, memcpy.correlationId, + memcpy.bytes, bandwidth(memcpy.bytes, memcpy.end - memcpy.start)); + // clang-format on + + int64_t to_id = activity.correlationId(); + handleLink(kFlowEnd, activity, to_id, "async_cpu_to_gpu", "async_gpu"); +} + +void ChromeTraceLogger::handleGpuActivity( + const GpuActivity& activity) { + if (!traceOf_) { + return; + } + const CUpti_ActivityMemset& memset = activity.raw(); + // clang-format off + traceOf_ << fmt::format(R"JSON( + {{ + "ph": "X", "cat": "Memset", {}, + "args": {{ + "device": {}, "context": {}, + "stream": {}, "correlation": {}, + "bytes": {}, "memory bandwidth (GB/s)": {} + }} + }},)JSON", + traceActivityJson(activity), + // args + memset.deviceId, memset.contextId, + memset.streamId, memset.correlationId, + memset.bytes, bandwidth(memset.bytes, memset.end - memset.start)); + // clang-format on + + int64_t to_id = activity.correlationId(); + handleLink(kFlowEnd, activity, to_id, "async_cpu_to_gpu", "async_gpu"); +} +#endif // HAS_CUPTI + +void ChromeTraceLogger::finalizeTrace( + const Config& /*unused*/, + std::unique_ptr /*unused*/, + int64_t endTime, + std::unordered_map>& metadata) { + if (!traceOf_) { + LOG(ERROR) << "Failed to write to log file!"; + return; + } + LOG(INFO) << "Chrome Trace written to " << fileName_; + // clang-format off + traceOf_ << fmt::format(R"JSON( + {{ + "name": "Record Window End", "ph": "i", "s": "g", + "pid": "", "tid": "", "ts": {} + }} + ],)JSON", + endTime); + +#if !USE_GOOGLE_LOG + std::unordered_map PreparedMetadata; + for (const auto& kv : metadata) { + // Skip empty log buckets, ex. skip ERROR if its empty. + if (!kv.second.empty()) { + std::string value = "["; + // Ex. Each metadata from logger is a list of strings, expressed in JSON as + // "ERROR": ["Error 1", "Error 2"], + // "WARNING": ["Warning 1", "Warning 2", "Warning 3"], + // ... 
+ int mdv_count = kv.second.size(); + for (const auto& v : kv.second) { + value.append("\"" + v + "\""); + if(mdv_count > 1) { + value.append(","); + mdv_count--; + } + } + value.append("]"); + PreparedMetadata[kv.first] = sanitizeStrForJSON(value); + } + } + metadataToJSON(PreparedMetadata); +#endif // !USE_GOOGLE_LOG + + // Putting this here because the last entry MUST not end with a comma. + traceOf_ << fmt::format(R"JSON( + "traceName": "{}" +}})JSON", sanitizeStrForJSON(fileName_)); + // clang-format on + + traceOf_.close(); +} + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/output_json.h b/tb_plugins/profiling/libkineto/src/output_json.h new file mode 100644 index 0000000000000000000000000000000000000000..5a8a81e4a9fdeef09b0e9ace59b964d5ab99b7ad --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/output_json.h @@ -0,0 +1,91 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include +#include +#include +#include +#include + +#ifdef HAS_CUPTI +#include +#endif +#include "GenericTraceActivity.h" +#include "output_base.h" + +namespace KINETO_NAMESPACE { + // Previous declaration of TraceSpan is struct. Must match the same here. + struct TraceSpan; +} + +namespace KINETO_NAMESPACE { + +class Config; + +class ChromeTraceLogger : public libkineto::ActivityLogger { + public: + explicit ChromeTraceLogger(const std::string& traceFileName); + + // Note: the caller of these functions should handle concurrency + // i.e., we these functions are not thread-safe + void handleDeviceInfo( + const DeviceInfo& info, + uint64_t time) override; + + void handleOverheadInfo(const OverheadInfo& info, int64_t time) override; + + void handleResourceInfo(const ResourceInfo& info, int64_t time) override; + + void handleTraceSpan(const TraceSpan& span) override; + + void handleActivity(const ITraceActivity& activity) override; + void handleGenericActivity(const GenericTraceActivity& activity) override; + +#ifdef HAS_CUPTI + void handleGpuActivity(const GpuActivity& activity) override; + void handleGpuActivity(const GpuActivity& activity) override; + void handleGpuActivity(const GpuActivity& activity) override; + void handleGpuActivity(const GpuActivity& activity) override; +#endif // HAS_CUPTI + + void handleTraceStart( + const std::unordered_map& metadata) override; + + void finalizeTrace( + const Config& config, + std::unique_ptr buffers, + int64_t endTime, + std::unordered_map>& metadata) override; + + std::string traceFileName() const { + return fileName_; + } + + private: + + // Create a flow event (arrow) + void handleLink( + char type, + const ITraceActivity& e, + int64_t id, + const std::string& cat, + const std::string& name); + + void addIterationMarker(const TraceSpan& span); + + void openTraceFile(); + + void handleGenericInstantEvent(const ITraceActivity& op); + + void handleGenericLink(const ITraceActivity& activity); + + void metadataToJSON(const std::unordered_map& metadata); + + std::string& sanitizeStrForJSON(std::string& value); + + std::string fileName_; + std::ofstream traceOf_; +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/src/output_membuf.h b/tb_plugins/profiling/libkineto/src/output_membuf.h new file mode 100644 index 0000000000000000000000000000000000000000..ef6aadeb65728e0e05e454f98b32ccecca229cf4 --- /dev/null +++ b/tb_plugins/profiling/libkineto/src/output_membuf.h @@ -0,0 +1,130 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
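+
+// In-memory logger: buffers activities and metadata during collection and
+// replays them into another ActivityLogger via log().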
+ +#pragma once + +#include +#include +#include +#include + +#ifdef HAS_CUPTI +#include +#endif + +#include "Config.h" +#include "GenericTraceActivity.h" +#ifdef HAS_CUPTI +#include "CuptiActivity.h" +#include "CuptiActivity.tpp" +#endif // HAS_CUPTI +#include "output_base.h" + +namespace KINETO_NAMESPACE { + +class Config; + +class MemoryTraceLogger : public ActivityLogger { + public: + MemoryTraceLogger(const Config& config) : config_(config.clone()) { + activities_.reserve(100000); + } + + // Note: the caller of these functions should handle concurrency + // i.e., these functions are not thread-safe + void handleDeviceInfo( + const DeviceInfo& info, + uint64_t time) override { + deviceInfoList_.emplace_back(info, time); + } + + void handleResourceInfo(const ResourceInfo& info, int64_t time) override { + resourceInfoList_.emplace_back(info, time); + } + + void handleOverheadInfo(const OverheadInfo& info, int64_t time) override {} + + void handleTraceSpan(const TraceSpan& span) override { + // Handled separately + } + + template + void addActivityWrapper(const T& act) { + wrappers_.push_back(std::make_unique(act)); + activities_.push_back(wrappers_.back().get()); + } + + // Just add the pointer to the list - ownership of the underlying + // objects must be transferred in ActivityBuffers via finalizeTrace + void handleActivity(const ITraceActivity& activity) override { + activities_.push_back(&activity); + } + void handleGenericActivity(const GenericTraceActivity& activity) override { + addActivityWrapper(activity); + } + +#ifdef HAS_CUPTI + void handleGpuActivity(const GpuActivity& activity) override { + addActivityWrapper(activity); + } + void handleGpuActivity(const GpuActivity& activity) override { + addActivityWrapper(activity); + } + void handleGpuActivity(const GpuActivity& activity) override { + addActivityWrapper(activity); + } + void handleGpuActivity(const GpuActivity& activity) override { + addActivityWrapper(activity); + } +#endif // HAS_CUPTI + + void handleTraceStart( + const std::unordered_map& metadata) override { + metadata_ = metadata; + } + + void finalizeTrace( + const Config& config, + std::unique_ptr buffers, + int64_t endTime, + std::unordered_map>& metadata) override { + buffers_ = std::move(buffers); + endTime_ = endTime; + } + + const std::vector* traceActivities() { + return &activities_; + } + + void log(ActivityLogger& logger) { + logger.handleTraceStart(metadata_); + for (auto& activity : activities_) { + activity->log(logger); + } + for (auto& p : deviceInfoList_) { + logger.handleDeviceInfo(p.first, p.second); + } + for (auto& p : resourceInfoList_) { + logger.handleResourceInfo(p.first, p.second); + } + for (auto& cpu_trace_buffer : buffers_->cpu) { + logger.handleTraceSpan(cpu_trace_buffer->span); + } + // Hold on to the buffers + logger.finalizeTrace(*config_, nullptr, endTime_, loggerMetadata_); + } + + private: + + std::unique_ptr config_; + // Optimization: Remove unique_ptr by keeping separate vector per type + std::vector activities_; + std::vector> wrappers_; + std::vector> deviceInfoList_; + std::vector> resourceInfoList_; + std::unique_ptr buffers_; + std::unordered_map metadata_; + std::unordered_map> loggerMetadata_; + int64_t endTime_{0}; +}; + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/test/CMakeLists.txt b/tb_plugins/profiling/libkineto/test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca54460b36cd4ade93918c8512f1309b48552e65 --- /dev/null +++ 
b/tb_plugins/profiling/libkineto/test/CMakeLists.txt @@ -0,0 +1,3 @@ +cmake_minimum_required(VERSION 3.5 FATAL_ERROR) + +# TODO diff --git a/tb_plugins/profiling/libkineto/test/ConfigTest.cpp b/tb_plugins/profiling/libkineto/test/ConfigTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..16bc86e751cefdbee1d48aeb79fc849b7d151a18 --- /dev/null +++ b/tb_plugins/profiling/libkineto/test/ConfigTest.cpp @@ -0,0 +1,315 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "include/Config.h" + +#include +#include +#include +#include + +using namespace std::chrono; +using namespace KINETO_NAMESPACE; + +TEST(ParseTest, Whitespace) { + Config cfg; + // Check that various types of whitespace is ignored + EXPECT_TRUE(cfg.parse("")); + EXPECT_TRUE(cfg.parse(" ")); + EXPECT_TRUE(cfg.parse("\t")); + EXPECT_TRUE(cfg.parse("\n")); + EXPECT_TRUE(cfg.parse(" ")); + EXPECT_TRUE(cfg.parse("\t \n \t\t\n\n")); + // Only the above characters are supported + EXPECT_FALSE(cfg.parse("\r\n")); +} + +TEST(ParseTest, Comment) { + Config cfg; + // Anything following a '#' should be ignored, up to a newline + EXPECT_TRUE(cfg.parse("# comment")); + EXPECT_TRUE(cfg.parse(" # ~!@#$")); + EXPECT_TRUE(cfg.parse("\t#abc")); + EXPECT_TRUE(cfg.parse("###\n##")); + EXPECT_TRUE(cfg.parse("EVENTS=util ##ok")); + EXPECT_TRUE(cfg.parse("EVENTS=util ## EVENTS=instruction")); + // Whatever appears before the comment must be valid format + EXPECT_FALSE(cfg.parse("util ## not ok")); + EXPECT_FALSE(cfg.parse("## ok \n blah # not OK")); + // Check that a comment does not affect config parsing + EXPECT_TRUE(cfg.parse("SAMPLE_PERIOD_MSECS = 1 # Sample every millisecond")); + EXPECT_EQ(cfg.samplePeriod(), milliseconds(1)); +} + +TEST(ParseTest, Format) { + Config cfg; + // The basic format is just "name = value". + // Where both value and name can be almost anything. + // Leading and trailing whitespace should be removed + // for both 'name' and 'value', but internal whitespace is not. + EXPECT_FALSE(cfg.parse("events")); + EXPECT_TRUE(cfg.parse("events=")); + EXPECT_FALSE(cfg.parse("=events=")); + EXPECT_TRUE(cfg.parse("events=1,2,3")); + // Only one setting per line + EXPECT_FALSE(cfg.parse("events = 1,2,3 ; metrics = 4,5,6")); + // Names are case sensitive + EXPECT_TRUE(cfg.parse("EVENTS = 1,2,3 \n metrics = 4,5,6")); + EXPECT_EQ(cfg.eventNames(), std::set({"1", "2", "3"})); + EXPECT_EQ(cfg.metricNames().size(), 0); + // Leading and trailing whitespace removed for event and metric names, + // but not internal. 
+  EXPECT_TRUE(
+      cfg.parse("EVENTS = 1, 2, 3 \n \tMETRICS\t = \t4,\t5\t,\ts i x "));
+  EXPECT_EQ(cfg.eventNames(), std::set<std::string>({"1", "2", "3"}));
+  EXPECT_EQ(cfg.metricNames(), std::set<std::string>({"4", "5", "s i x"}));
+}
+
+TEST(ParseTest, DefaultActivityTypes) {
+  Config cfg;
+  cfg.validate(std::chrono::system_clock::now());
+  auto all_activities = activityTypes();
+  // TODO: introduce optional activities
+  EXPECT_EQ(cfg.selectedActivityTypes(),
+    std::set<ActivityType>(all_activities.begin(), all_activities.end() - 1));
+}
+
+TEST(ParseTest, ActivityTypes) {
+  Config cfg;
+  EXPECT_FALSE(cfg.parse("ACTIVITY_TYPES"));
+  EXPECT_TRUE(cfg.parse("ACTIVITY_TYPES="));
+  EXPECT_FALSE(cfg.parse("=ACTIVITY_TYPES="));
+
+  EXPECT_EQ(cfg.selectedActivityTypes(),
+    std::set<ActivityType>({ActivityType::CPU_OP,
+                            ActivityType::CPU_INSTANT_EVENT,
+                            ActivityType::PYTHON_FUNCTION,
+                            ActivityType::USER_ANNOTATION,
+                            ActivityType::GPU_USER_ANNOTATION,
+                            ActivityType::GPU_MEMCPY,
+                            ActivityType::GPU_MEMSET,
+                            ActivityType::CONCURRENT_KERNEL,
+                            ActivityType::EXTERNAL_CORRELATION,
+                            ActivityType::GLOW_RUNTIME,
+                            ActivityType::CUDA_RUNTIME,
+                            ActivityType::CUDA_PROFILER_RANGE}));
+
+  Config cfg2;
+  EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES=gpu_memcpy,gpu_MeMsEt,kernel"));
+  EXPECT_EQ(cfg2.selectedActivityTypes(),
+    std::set<ActivityType>({ActivityType::GPU_MEMCPY,
+                            ActivityType::GPU_MEMSET,
+                            ActivityType::CONCURRENT_KERNEL}));
+
+  EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = cuda_Runtime,"));
+  EXPECT_EQ(cfg2.selectedActivityTypes(),
+    std::set<ActivityType>({ActivityType::CUDA_RUNTIME}));
+
+  // Should fail because of an invalid activity name
+  EXPECT_FALSE(cfg2.parse("ACTIVITY_TYPES = memcopy,cuda_runtime"));
+
+  EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = cpu_op"));
+  EXPECT_EQ(cfg2.selectedActivityTypes(),
+    std::set<ActivityType>({ActivityType::CPU_OP}));
+}
+
+TEST(ParseTest, SamplePeriod) {
+  Config cfg;
+  EXPECT_TRUE(cfg.parse("SAMPLE_PERIOD_MSECS=10"));
+  EXPECT_EQ(cfg.samplePeriod(), milliseconds(10));
+  EXPECT_TRUE(cfg.parse("SAMPLE_PERIOD_MSECS=0"));
+  cfg.validate(std::chrono::system_clock::now());
+  // 0 should be adjusted up to 1
+  EXPECT_EQ(cfg.samplePeriod(), milliseconds(1));
+  // Negative and non-int values should fail
+  EXPECT_FALSE(cfg.parse("SAMPLE_PERIOD_MSECS=-10"));
+  EXPECT_FALSE(cfg.parse("SAMPLE_PERIOD_MSECS=1.5"));
+  EXPECT_FALSE(cfg.parse("SAMPLE_PERIOD_MSECS="));
+  EXPECT_FALSE(cfg.parse("SAMPLE_PERIOD_MSECS=string"));
+  EXPECT_EQ(cfg.samplePeriod(), milliseconds(1));
+}
+
+TEST(ParseTest, MultiplexPeriod) {
+  Config cfg;
+  auto now = std::chrono::system_clock::now();
+
+  EXPECT_TRUE(cfg.parse("SAMPLE_PERIOD_MSECS=100\nMULTIPLEX_PERIOD_MSECS=100"));
+  EXPECT_EQ(cfg.multiplexPeriod(), milliseconds(100));
+  EXPECT_TRUE(cfg.parse("MULTIPLEX_PERIOD_MSECS = 0"));
+  cfg.validate(now);
+  // Adjusted to match the sample period
+  EXPECT_EQ(cfg.multiplexPeriod(), milliseconds(100));
+  EXPECT_TRUE(cfg.parse("MULTIPLEX_PERIOD_MSECS \t= \t 750 \n"));
+  cfg.validate(now);
+  // Adjusted to a multiple of the sample period
+  EXPECT_EQ(cfg.multiplexPeriod(), milliseconds(800));
+  EXPECT_FALSE(cfg.parse("MULTIPLEX_PERIOD_MSECS=-10"));
+  EXPECT_FALSE(cfg.parse("MULTIPLEX_PERIOD_MSECS=1.5"));
+  EXPECT_FALSE(cfg.parse("MULTIPLEX_PERIOD_MSECS="));
+  EXPECT_FALSE(cfg.parse("MULTIPLEX_PERIOD_MSECS=string"));
+  // Previous value not affected
+  EXPECT_EQ(cfg.multiplexPeriod(), milliseconds(800));
+}
+
+TEST(ParseTest, ReportPeriod) {
+  Config cfg;
+  EXPECT_TRUE(cfg.parse("REPORT_PERIOD_SECS=1"));
+  EXPECT_EQ(cfg.reportPeriod(), seconds(1));
+  // Whitespace
+
EXPECT_TRUE(cfg.parse("REPORT_PERIOD_SECS = \t100")); + EXPECT_EQ(cfg.reportPeriod(), seconds(100)); + // Invalid types + EXPECT_FALSE(cfg.parse("REPORT_PERIOD_SECS=-1")); + EXPECT_EQ(cfg.reportPeriod(), seconds(100)); +} + +TEST(ParseTest, SamplesPerReport) { + Config cfg; + auto now = std::chrono::system_clock::now(); + + EXPECT_TRUE(cfg.parse(R"( + SAMPLE_PERIOD_MSECS = 1000 + REPORT_PERIOD_SECS = 1 + SAMPLES_PER_REPORT = 10)")); + cfg.validate(now); + // Adjusted down to one sample per report + EXPECT_EQ(cfg.samplesPerReport(), 1); + EXPECT_TRUE(cfg.parse(R"( + SAMPLE_PERIOD_MSECS = 1000 + REPORT_PERIOD_SECS = 10 + SAMPLES_PER_REPORT = 10)")); + cfg.validate(now); + // No adjustment needed + EXPECT_EQ(cfg.samplesPerReport(), 10); + EXPECT_TRUE(cfg.parse(R"( + SAMPLE_PERIOD_MSECS = 1000 + REPORT_PERIOD_SECS = 2 + SAMPLES_PER_REPORT = 10)")); + cfg.validate(now); + // Adjusted to 2 samples per report + EXPECT_EQ(cfg.samplesPerReport(), 2); + EXPECT_TRUE(cfg.parse(R"( + SAMPLE_PERIOD_MSECS = 200 + REPORT_PERIOD_SECS = 2 + SAMPLES_PER_REPORT = 10)")); + cfg.validate(now); + // No adjustment needed + EXPECT_EQ(cfg.samplesPerReport(), 10); + EXPECT_TRUE(cfg.parse("SAMPLES_PER_REPORT=0")); + cfg.validate(now); + // Adjusted up to 1 + EXPECT_EQ(cfg.samplesPerReport(), 1); + // Invalid value types + EXPECT_FALSE(cfg.parse("SAMPLES_PER_REPORT=-10")); + EXPECT_FALSE(cfg.parse("SAMPLES_PER_REPORT=1.5")); + EXPECT_EQ(cfg.samplesPerReport(), 1); + + EXPECT_TRUE(cfg.parse(R"( + SAMPLE_PERIOD_MSECS=1000 + MULTIPLEX_PERIOD_MSECS=500 # Must be a multiple of sample period + REPORT_PERIOD_SECS=0 # Must be non-zero multiple of multiplex period + SAMPLES_PER_REPORT=5 # Max report period / multiplex period)")); + cfg.validate(now); + // Multiple adjustments + EXPECT_EQ(cfg.samplePeriod(), milliseconds(1000)); + EXPECT_EQ(cfg.multiplexPeriod(), milliseconds(1000)); + EXPECT_EQ(cfg.reportPeriod(), seconds(1)); + EXPECT_EQ(cfg.samplesPerReport(), 1); +} + +TEST(ParseTest, EnableSigUsr2) { + Config cfg; + EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=yes")); + EXPECT_TRUE(cfg.sigUsr2Enabled()); + EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=no")); + EXPECT_FALSE(cfg.sigUsr2Enabled()); + EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=YES")); + EXPECT_TRUE(cfg.sigUsr2Enabled()); + EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=NO")); + EXPECT_FALSE(cfg.sigUsr2Enabled()); + EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=Y")); + EXPECT_TRUE(cfg.sigUsr2Enabled()); + EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=N")); + EXPECT_FALSE(cfg.sigUsr2Enabled()); + EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=T")); + EXPECT_TRUE(cfg.sigUsr2Enabled()); + EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=F")); + EXPECT_FALSE(cfg.sigUsr2Enabled()); + EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=true")); + EXPECT_TRUE(cfg.sigUsr2Enabled()); + EXPECT_TRUE(cfg.parse("ENABLE_SIGUSR2=false")); + EXPECT_FALSE(cfg.sigUsr2Enabled()); + EXPECT_FALSE(cfg.parse("ENABLE_SIGUSR2= ")); + EXPECT_FALSE(cfg.parse("ENABLE_SIGUSR2=2")); + EXPECT_FALSE(cfg.parse("ENABLE_SIGUSR2=-1")); + EXPECT_FALSE(cfg.parse("ENABLE_SIGUSR2=yep")); +} + +TEST(ParseTest, DeviceMask) { + Config cfg; + // Single device + EXPECT_TRUE(cfg.parse("EVENTS_ENABLED_DEVICES = 0")); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(0)); + EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(1)); + + // Two devices, internal whitespace + EXPECT_TRUE(cfg.parse("EVENTS_ENABLED_DEVICES = 1, 2")); + EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(0)); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(1)); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(2)); + 
EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(3)); + + // Three devices, check that previous devices are ignored + EXPECT_TRUE(cfg.parse("EVENTS_ENABLED_DEVICES = 0, 2,4")); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(0)); + EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(1)); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(2)); + EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(3)); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(4)); + EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(5)); + + // Repeated numbers have no effect + EXPECT_TRUE(cfg.parse("EVENTS_ENABLED_DEVICES = 0,1,1,1,2,3,2,1,3,7,7,3")); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(0)); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(1)); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(2)); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(3)); + EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(4)); + EXPECT_FALSE(cfg.eventProfilerEnabledForDevice(6)); + EXPECT_TRUE(cfg.eventProfilerEnabledForDevice(7)); + + // 8 is larger than the max allowed + EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = 3,8")); + + // 300 cannot be held in an uint8_t + EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = 300")); + + // Various illegal cases + EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = 0,1,two,three")); + EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = 0,1,,2")); + EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = -1")); + EXPECT_FALSE(cfg.parse("EVENTS_ENABLED_DEVICES = 1.0")); +} + +TEST(ParseTest, RequestTime) { + Config cfg; + system_clock::time_point now = system_clock::now(); + int64_t tgood_ms = + duration_cast(now.time_since_epoch()).count(); + EXPECT_TRUE(cfg.parse(fmt::format("REQUEST_TIMESTAMP = {}", tgood_ms))); + + tgood_ms = duration_cast((now - seconds(5)).time_since_epoch()) + .count(); + EXPECT_TRUE(cfg.parse(fmt::format("REQUEST_TIMESTAMP = {}", tgood_ms))); + + int64_t tbad_ms = + duration_cast((now - seconds(20)).time_since_epoch()) + .count(); + EXPECT_FALSE(cfg.parse(fmt::format("REQUEST_TIMESTAMP = {}", tbad_ms))); + + EXPECT_FALSE(cfg.parse("REQUEST_TIMESTAMP = 0")); + EXPECT_FALSE(cfg.parse("REQUEST_TIMESTAMP = -1")); + + tbad_ms = duration_cast((now + seconds(10)).time_since_epoch()) + .count(); + EXPECT_FALSE(cfg.parse(fmt::format("REQUEST_TIMESTAMP = {}", tbad_ms))); +} diff --git a/tb_plugins/profiling/libkineto/test/CuptiActivityProfilerTest.cpp b/tb_plugins/profiling/libkineto/test/CuptiActivityProfilerTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6e67980ee31a3386580974033201b7acae75d22b --- /dev/null +++ b/tb_plugins/profiling/libkineto/test/CuptiActivityProfilerTest.cpp @@ -0,0 +1,629 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
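The RequestTime test above suggests the parser rejects REQUEST_TIMESTAMP values that lie in the future or more than a few seconds in the past (a 5-second-old timestamp passes, a 20-second-old one fails). A minimal sketch of such a staleness rule; the 10-second window and the function name are assumptions for illustration, not values taken from Config:

```cpp
#include <chrono>
#include <cstdint>

// Accept only recent, non-future timestamps (milliseconds since epoch).
bool isRequestTimestampFresh(int64_t requested_ms) {
  using namespace std::chrono;
  const int64_t now_ms =
      duration_cast<milliseconds>(system_clock::now().time_since_epoch())
          .count();
  const int64_t max_staleness_ms = 10000;  // assumed window, not Config's value
  return requested_ms > 0 && requested_ms <= now_ms &&
      (now_ms - requested_ms) <= max_staleness_ms;
}
```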
+
+#include <fmt/format.h>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <chrono>
+#include <memory>
+#include <strings.h>
+#include <vector>
+
+#ifdef __linux__
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+
+#include "include/libkineto.h"
+#include "include/Config.h"
+#include "src/CuptiActivityProfiler.h"
+#include "src/ActivityTrace.h"
+#include "src/CuptiActivityApi.h"
+#include "src/output_base.h"
+#include "src/output_json.h"
+#include "src/output_membuf.h"
+
+#include "src/Logger.h"
+#include "test/MockActivitySubProfiler.h"
+
+using namespace std::chrono;
+using namespace KINETO_NAMESPACE;
+
+#define CUDA_LAUNCH_KERNEL CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
+#define CUDA_MEMCPY CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020
+
+namespace {
+const TraceSpan& defaultTraceSpan() {
+  static TraceSpan span(0, 0, "Unknown", "");
+  return span;
+}
+}
+
+// Provides the ability to easily create a few test CPU-side ops
+struct MockCpuActivityBuffer : public CpuTraceBuffer {
+  MockCpuActivityBuffer(int64_t startTime, int64_t endTime) {
+    span = TraceSpan(startTime, endTime, "Test trace");
+    gpuOpCount = 0;
+  }
+
+  void addOp(std::string name, int64_t startTime, int64_t endTime, int64_t correlation) {
+    GenericTraceActivity op(span, ActivityType::CPU_OP, name);
+    op.startTime = startTime;
+    op.endTime = endTime;
+    op.resource = systemThreadId();
+    op.id = correlation;
+    activities.push_back(std::move(op));
+    span.opCount++;
+  }
+};
+
+// Provides the ability to easily create a few test CUPTI ops
+struct MockCuptiActivityBuffer {
+  void addCorrelationActivity(int64_t correlation, CUpti_ExternalCorrelationKind externalKind, int64_t externalId) {
+    auto& act = *(CUpti_ActivityExternalCorrelation*) malloc(sizeof(CUpti_ActivityExternalCorrelation));
+    act.kind = CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION;
+    act.externalId = externalId;
+    act.externalKind = externalKind;
+    act.correlationId = correlation;
+    activities.push_back(reinterpret_cast<CUpti_Activity*>(&act));
+  }
+
+  void addRuntimeActivity(
+      CUpti_runtime_api_trace_cbid_enum cbid,
+      int64_t start_us, int64_t end_us, int64_t correlation) {
+    auto& act = createActivity<CUpti_ActivityAPI>(
+        start_us, end_us, correlation);
+    act.kind = CUPTI_ACTIVITY_KIND_RUNTIME;
+    act.cbid = cbid;
+    act.threadId = threadId();
+    activities.push_back(reinterpret_cast<CUpti_Activity*>(&act));
+  }
+
+  void addKernelActivity(
+      int64_t start_us, int64_t end_us, int64_t correlation) {
+    auto& act = createActivity<CUpti_ActivityKernel4>(
+        start_us, end_us, correlation);
+    act.kind = CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL;
+    act.deviceId = 0;
+    act.streamId = 1;
+    act.name = "kernel";
+    act.gridX = act.gridY = act.gridZ = 1;
+    act.blockX = act.blockY = act.blockZ = 1;
+    activities.push_back(reinterpret_cast<CUpti_Activity*>(&act));
+  }
+
+  void addMemcpyActivity(
+      int64_t start_us, int64_t end_us, int64_t correlation) {
+    auto& act = createActivity<CUpti_ActivityMemcpy>(
+        start_us, end_us, correlation);
+    act.kind = CUPTI_ACTIVITY_KIND_MEMCPY;
+    act.deviceId = 0;
+    act.streamId = 2;
+    act.copyKind = CUPTI_ACTIVITY_MEMCPY_KIND_HTOD;
+    act.srcKind = CUPTI_ACTIVITY_MEMORY_KIND_PINNED;
+    act.dstKind = CUPTI_ACTIVITY_MEMORY_KIND_DEVICE;
+    activities.push_back(reinterpret_cast<CUpti_Activity*>(&act));
+  }
+
+  template <class T>
+  T& createActivity(
+      int64_t start_us, int64_t end_us, int64_t correlation) {
+    T& act = *static_cast<T*>(malloc(sizeof(T)));
+    bzero(&act, sizeof(act));
+    act.start = start_us * 1000;
+    act.end = end_us * 1000;
+    act.correlationId = correlation;
+    return act;
+  }
+
+  ~MockCuptiActivityBuffer() {
+    for (CUpti_Activity* act : activities) {
+      free(act);
+    }
+  }
+
+  std::vector<CUpti_Activity*> activities;
+};
+
+// Mock parts of the CuptiActivityApi
+class MockCuptiActivities : public
CuptiActivityApi { + public: + virtual int smCount() override { + return 10; + } + + virtual const std::pair processActivities( + CuptiActivityBufferMap&, /*unused*/ + std::function handler) override { + for (CUpti_Activity* act : activityBuffer->activities) { + handler(act); + } + return {activityBuffer->activities.size(), 100}; + } + + virtual std::unique_ptr + activityBuffers() override { + auto map = std::make_unique(); + auto buf = std::make_unique(100); + uint8_t* addr = buf->data(); + (*map)[addr] = std::move(buf); + return map; + } + + void bufferRequestedOverride(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { + this->bufferRequested(buffer, size, maxNumRecords); + } + + std::unique_ptr activityBuffer; +}; + + +// Common setup / teardown and helper functions +class CuptiActivityProfilerTest : public ::testing::Test { + protected: + void SetUp() override { + profiler_ = std::make_unique( + cuptiActivities_, /*cpu only*/ false); + cfg_ = std::make_unique(); + cfg_->validate(std::chrono::system_clock::now()); + loggerFactory.addProtocol("file", [](const std::string& url) { + return std::unique_ptr(new ChromeTraceLogger(url)); + }); + } + + std::unique_ptr cfg_; + MockCuptiActivities cuptiActivities_; + std::unique_ptr profiler_; + ActivityLoggerFactory loggerFactory; +}; + +void checkTracefile(const char* filename) { +#ifdef __linux__ + // Check that the expected file was written and that it has some content + int fd = open(filename, O_RDONLY); + if (!fd) { + perror(filename); + } + EXPECT_TRUE(fd); + // Should expect at least 100 bytes + struct stat buf{}; + fstat(fd, &buf); + EXPECT_GT(buf.st_size, 100); + close(fd); +#endif +} + +TEST(CuptiActivityProfiler, AsyncTrace) { + std::vector log_modules( + {"CuptiActivityProfiler.cpp", "output_json.cpp"}); + SET_LOG_VERBOSITY_LEVEL(1, log_modules); + + MockCuptiActivities activities; + CuptiActivityProfiler profiler(activities, /*cpu only*/ true); + + char filename[] = "/tmp/libkineto_testXXXXXX.json"; + mkstemps(filename, 5); + + Config cfg; + + int iter = 0; + int warmup = 5; + auto now = system_clock::now(); + auto startTime = now + seconds(10); + + bool success = cfg.parse(fmt::format(R"CFG( + ACTIVITIES_WARMUP_PERIOD_SECS = {} + ACTIVITIES_DURATION_SECS = 1 + ACTIVITIES_LOG_FILE = {} + PROFILE_START_TIME = {} + )CFG", warmup, filename, duration_cast(startTime.time_since_epoch()).count())); + + EXPECT_TRUE(success); + EXPECT_FALSE(profiler.isActive()); + + auto logger = std::make_unique(cfg.activitiesLogFile()); + + // Usually configuration is done when now is startTime - warmup to kick off warmup + // but start right away in the test + profiler.configure(cfg, now); + profiler.setLogger(logger.get()); + + EXPECT_TRUE(profiler.isActive()); + + // fast forward in time and we have reached the startTime + now = startTime; + + // Run the profiler + // Warmup + // performRunLoopStep is usually called by the controller loop and takes + // the current time and the controller's next wakeup time. + profiler.performRunLoopStep( + /* Current time */ now, /* Next wakeup time */ now); + + auto next = now + milliseconds(1000); + + // performRunLoopStep can also be called by an application thread to update iteration count + // since this config does not use iteration this should have no effect on the state + while (++iter < 20) { + profiler.performRunLoopStep(now, now, iter); + } + + // Runloop should now be in collect state, so start workload + // Perform another runloop step, passing in the end profile time as current. 
+  // This should terminate collection
+  profiler.performRunLoopStep(
+      /* Current time */ next, /* Next wakeup time */ next);
+  // One step needed for each of the Process and Finalize phases
+  // Doesn't really matter what times we pass in here.
+
+  EXPECT_TRUE(profiler.isActive());
+
+  auto nextnext = next + milliseconds(1000);
+
+  while (++iter < 40) {
+    profiler.performRunLoopStep(next, next, iter);
+  }
+
+  EXPECT_TRUE(profiler.isActive());
+
+  profiler.performRunLoopStep(nextnext, nextnext);
+  profiler.performRunLoopStep(nextnext, nextnext);
+
+  // Assert that tracing has completed
+  EXPECT_FALSE(profiler.isActive());
+
+  checkTracefile(filename);
+}
+
+TEST(CuptiActivityProfiler, AsyncTraceUsingIter) {
+  std::vector<std::string> log_modules(
+      {"CuptiActivityProfiler.cpp", "output_json.cpp"});
+  SET_LOG_VERBOSITY_LEVEL(1, log_modules);
+
+  auto runIterTest = [&](
+      int start_iter, int warmup_iters, int trace_iters) {
+
+    LOG(INFO) << "Async Trace Test: start_iteration = " << start_iter
+              << " warmup iterations = " << warmup_iters
+              << " trace iterations = " << trace_iters;
+
+    MockCuptiActivities activities;
+    CuptiActivityProfiler profiler(activities, /*cpu only*/ true);
+
+    char filename[] = "/tmp/libkineto_testXXXXXX.json";
+    mkstemps(filename, 5);
+
+    Config cfg;
+
+    int iter = 0;
+    auto now = system_clock::now();
+
+    bool success = cfg.parse(fmt::format(R"CFG(
+      PROFILE_START_ITERATION = {}
+      ACTIVITIES_WARMUP_ITERATIONS={}
+      ACTIVITIES_ITERATIONS={}
+      ACTIVITIES_DURATION_SECS = 1
+      ACTIVITIES_LOG_FILE = {}
+    )CFG", start_iter, warmup_iters, trace_iters, filename));
+
+    EXPECT_TRUE(success);
+    EXPECT_FALSE(profiler.isActive());
+
+    auto logger = std::make_unique<ChromeTraceLogger>(cfg.activitiesLogFile());
+
+    // Usually configuration is done when now is startIter - warmup iters,
+    // to kick off the warmup, but start right away in the test
+    while (iter < (start_iter - warmup_iters)) {
+      profiler.performRunLoopStep(now, now, iter++);
+    }
+
+    profiler.configure(cfg, now);
+    profiler.setLogger(logger.get());
+
+    EXPECT_TRUE(profiler.isActive());
+
+    // fast forward in time, mimicking what will happen in reality
+    now += seconds(10);
+    auto next = now + milliseconds(1000);
+
+    // this call to the runloop step should not be affecting the state
+    profiler.performRunLoopStep(now, next);
+    EXPECT_TRUE(profiler.isActive());
+
+    // start trace collection
+    while (iter < start_iter) {
+      profiler.performRunLoopStep(now, next, iter++);
+    }
+
+    // Runloop should now be in collect state, so start workload
+
+    while (iter < (start_iter + trace_iters)) {
+      profiler.performRunLoopStep(now, next, iter++);
+    }
+
+    // One step is required for each of the Process and Finalize phases
+    // Doesn't really matter what times we pass in here.
+ if (iter >= (start_iter + trace_iters)) { + profiler.performRunLoopStep(now, next, iter++); + } + EXPECT_TRUE(profiler.isActive()); + + auto nextnext = next + milliseconds(1000); + + profiler.performRunLoopStep(nextnext, nextnext); + profiler.performRunLoopStep(nextnext, nextnext); + + // Assert that tracing has completed + EXPECT_FALSE(profiler.isActive()); + + checkTracefile(filename); + }; + + // start iter = 50, warmup iters = 5, trace iters = 10 + runIterTest(50, 5, 10); + // should be able to start at 0 iteration + runIterTest(0, 0, 2); + runIterTest(0, 5, 5); +} + +TEST_F(CuptiActivityProfilerTest, SyncTrace) { + using ::testing::Return; + using ::testing::ByMove; + + // Verbose logging is useful for debugging + std::vector log_modules( + {"CuptiActivityProfiler.cpp"}); + SET_LOG_VERBOSITY_LEVEL(2, log_modules); + + // Start and stop profiling + CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false); + int64_t start_time_us = 100; + int64_t duration_us = 300; + auto start_time = time_point(microseconds(start_time_us)); + profiler.configure(*cfg_, start_time); + profiler.startTrace(start_time); + profiler.stopTrace(start_time + microseconds(duration_us)); + + profiler.recordThreadInfo(); + + // Log some cpu ops + auto cpuOps = std::make_unique( + start_time_us, start_time_us + duration_us); + cpuOps->addOp("op1", 120, 150, 1); + cpuOps->addOp("op2", 130, 140, 2); + cpuOps->addOp("op3", 200, 250, 3); + profiler.transferCpuTrace(std::move(cpuOps)); + + // And some GPU ops + auto gpuOps = std::make_unique(); + gpuOps->addRuntimeActivity(CUDA_LAUNCH_KERNEL, 133, 138, 1); + gpuOps->addRuntimeActivity(CUDA_MEMCPY, 210, 220, 2); + gpuOps->addRuntimeActivity(CUDA_LAUNCH_KERNEL, 230, 245, 3); + gpuOps->addKernelActivity(150, 170, 1); + gpuOps->addMemcpyActivity(240, 250, 2); + gpuOps->addKernelActivity(260, 320, 3); + cuptiActivities_.activityBuffer = std::move(gpuOps); + + // Have the profiler process them + auto logger = std::make_unique(*cfg_); + profiler.processTrace(*logger); + + // Profiler can be reset at this point - logger owns the activities + profiler_->reset(); + + // Wrapper that allows iterating over the activities + ActivityTrace trace(std::move(logger), loggerFactory); + EXPECT_EQ(trace.activities()->size(), 9); + std::map activityCounts; + std::map resourceIds; + for (auto& activity : *trace.activities()) { + activityCounts[activity->name()]++; + resourceIds[activity->resourceId()]++; + } + for (const auto& p : activityCounts) { + LOG(INFO) << p.first << ": " << p.second; + } + EXPECT_EQ(activityCounts["op1"], 1); + EXPECT_EQ(activityCounts["op2"], 1); + EXPECT_EQ(activityCounts["op3"], 1); + EXPECT_EQ(activityCounts["cudaLaunchKernel"], 2); + EXPECT_EQ(activityCounts["cudaMemcpy"], 1); + EXPECT_EQ(activityCounts["kernel"], 2); + EXPECT_EQ(activityCounts["Memcpy HtoD (Pinned -> Device)"], 1); + + auto sysTid = systemThreadId(); + // Ops and runtime events are on thread sysTid + EXPECT_EQ(resourceIds[sysTid], 6); + // Kernels are on stream 1, memcpy on stream 2 + EXPECT_EQ(resourceIds[1], 2); + EXPECT_EQ(resourceIds[2], 1); + +#ifdef __linux__ + char filename[] = "/tmp/libkineto_testXXXXXX.json"; + mkstemps(filename, 5); + trace.save(filename); + // Check that the expected file was written and that it has some content + int fd = open(filename, O_RDONLY); + if (!fd) { + perror(filename); + } + EXPECT_TRUE(fd); + // Should expect at least 100 bytes + struct stat buf{}; + fstat(fd, &buf); + EXPECT_GT(buf.st_size, 100); +#endif +} + 
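The SyncTrace test just above relies on CPU ops and CUPTI records sharing a correlation id, which is how the trace stitches the two timelines together (op1/op2/op3 pair up with the kernel and memcpy activities). A stripped-down sketch of that join, using a hypothetical Event record rather than the real ITraceActivity interface:

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct Event {
  int64_t correlation;
  std::string name;
};

// Pair each CPU-side op with the GPU activity carrying the same
// correlation id, e.g. {1: {"op1", "kernel"}, 2: {"op2", "Memcpy HtoD"}}.
std::unordered_map<int64_t, std::pair<std::string, std::string>>
joinByCorrelation(const std::vector<Event>& cpuOps,
                  const std::vector<Event>& gpuActs) {
  std::unordered_map<int64_t, std::pair<std::string, std::string>> joined;
  for (const auto& e : cpuOps) {
    joined[e.correlation].first = e.name;
  }
  for (const auto& e : gpuActs) {
    joined[e.correlation].second = e.name;
  }
  return joined;
}
```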
+TEST_F(CuptiActivityProfilerTest, GpuUserAnnotationTest) { + // Verbose logging is useful for debugging + std::vector log_modules( + {"CuptiActivityProfiler.cpp"}); + SET_LOG_VERBOSITY_LEVEL(2, log_modules); + + // Start and stop profiling + CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false); + int64_t start_time_us = 100; + int64_t duration_us = 300; + auto start_time = time_point(microseconds(start_time_us)); + profiler.configure(*cfg_, start_time); + profiler.startTrace(start_time); + profiler.stopTrace(start_time + microseconds(duration_us)); + + int64_t kernelLaunchTime = 120; + profiler.recordThreadInfo(); + + // set up CPU event + auto cpuOps = std::make_unique( + start_time_us, start_time_us + duration_us); + cpuOps->addOp("annotation", kernelLaunchTime, kernelLaunchTime + 10, 1); + profiler.transferCpuTrace(std::move(cpuOps)); + + // set up a couple of GPU events and correlate with above CPU event. + // CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1 is used for user annotations. + auto gpuOps = std::make_unique(); + gpuOps->addCorrelationActivity(1, CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, 1); + gpuOps->addKernelActivity(kernelLaunchTime + 5, kernelLaunchTime + 10, 1); + gpuOps->addCorrelationActivity(1, CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1, 1); + gpuOps->addKernelActivity(kernelLaunchTime + 15, kernelLaunchTime + 25, 1); + cuptiActivities_.activityBuffer = std::move(gpuOps); + + // process trace + auto logger = std::make_unique(*cfg_); + profiler.processTrace(*logger); + + ActivityTrace trace(std::move(logger), loggerFactory); + std::map counts; + for (auto& activity : *trace.activities()) { + counts[activity->name()]++; + } + + // We should now have an additional annotation activity created + // on the GPU timeline. + EXPECT_EQ(counts["annotation"], 2); + EXPECT_EQ(counts["kernel"], 2); + + auto& annotation = trace.activities()->at(0); + auto& kernel1 = trace.activities()->at(1); + auto& kernel2 = trace.activities()->at(2); + auto& gpu_annotation = trace.activities()->at(3); + EXPECT_EQ(gpu_annotation->type(), ActivityType::GPU_USER_ANNOTATION); + EXPECT_EQ(gpu_annotation->timestamp(), kernel1->timestamp()); + EXPECT_EQ( + gpu_annotation->duration(), + kernel2->timestamp() + kernel2->duration() - kernel1->timestamp()); + EXPECT_EQ(gpu_annotation->deviceId(), kernel1->deviceId()); + EXPECT_EQ(gpu_annotation->resourceId(), kernel1->resourceId()); + EXPECT_EQ(gpu_annotation->correlationId(), annotation->correlationId()); + EXPECT_EQ(gpu_annotation->name(), annotation->name()); +} + +TEST_F(CuptiActivityProfilerTest, SubActivityProfilers) { + using ::testing::Return; + using ::testing::ByMove; + + // Verbose logging is useful for debugging + std::vector log_modules( + {"CuptiActivityProfiler.cpp"}); + SET_LOG_VERBOSITY_LEVEL(2, log_modules); + + // Setup example events to test + GenericTraceActivity ev{defaultTraceSpan(), ActivityType::GLOW_RUNTIME, ""}; + ev.device = 1; + ev.resource = 0; + + int64_t start_time_us = 100; + int64_t duration_us = 1000; + auto start_time = time_point(microseconds(start_time_us)); + + std::vector test_activities{3, ev}; + test_activities[0].startTime = start_time_us; + test_activities[0].endTime = start_time_us + 5000; + test_activities[0].activityName = "SubGraph A execution"; + test_activities[1].startTime = start_time_us; + test_activities[1].endTime = start_time_us + 2000; + test_activities[1].activityName = "Operator foo"; + test_activities[2].startTime = start_time_us + 2500; + test_activities[2].endTime = start_time_us + 2900; + 
test_activities[2].activityName = "Operator bar";
+
+  auto mock_activity_profiler =
+      std::make_unique<MockActivityProfiler>(test_activities);
+
+  MockCuptiActivities activities;
+  CuptiActivityProfiler profiler(activities, /*cpu only*/ true);
+  profiler.addChildActivityProfiler(
+      std::move(mock_activity_profiler));
+
+  profiler.configure(*cfg_, start_time);
+  profiler.startTrace(start_time);
+  EXPECT_TRUE(profiler.isActive());
+
+  profiler.stopTrace(start_time + microseconds(duration_us));
+  EXPECT_TRUE(profiler.isActive());
+
+  char filename[] = "/tmp/libkineto_testXXXXXX.json";
+  mkstemps(filename, 5);
+  LOG(INFO) << "Logging to tmp file " << filename;
+
+  // process trace
+  auto logger = std::make_unique<MemoryTraceLogger>(*cfg_);
+  profiler.processTrace(*logger);
+  profiler.setLogger(logger.get());
+
+  ActivityTrace trace(std::move(logger), loggerFactory);
+  trace.save(filename);
+  const auto& traced_activities = trace.activities();
+
+  // Test we have all the events
+  EXPECT_EQ(traced_activities->size(), test_activities.size());
+
+  // Check that the expected file was written and that it has some content
+  int fd = open(filename, O_RDONLY);
+  if (!fd) {
+    perror(filename);
+  }
+  EXPECT_TRUE(fd);
+
+  // Should expect at least 100 bytes
+  struct stat buf{};
+  fstat(fd, &buf);
+  EXPECT_GT(buf.st_size, 100);
+}
+
+TEST_F(CuptiActivityProfilerTest, BufferSizeLimitTestWarmup) {
+  CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false);
+
+  auto now = system_clock::now();
+  auto startTime = now + seconds(10);
+
+  int maxBufferSizeMB = 3;
+
+  auto startTimeEpoch = std::to_string(
+      duration_cast<milliseconds>(startTime.time_since_epoch()).count());
+  std::string maxBufferSizeMBStr = std::to_string(maxBufferSizeMB);
+  cfg_->handleOption("ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB", maxBufferSizeMBStr);
+  cfg_->handleOption("PROFILE_START_TIME", startTimeEpoch);
+
+  EXPECT_FALSE(profiler.isActive());
+  profiler.configure(*cfg_, now);
+  EXPECT_TRUE(profiler.isActive());
+
+  for (size_t i = 0; i < maxBufferSizeMB; i++) {
+    uint8_t* buf;
+    size_t gpuBufferSize;
+    size_t maxNumRecords;
+    cuptiActivities_.bufferRequestedOverride(&buf, &gpuBufferSize, &maxNumRecords);
+  }
+
+  // fast forward to startTime; the profiler is now running
+  now = startTime;
+
+  profiler.performRunLoopStep(now, now);
+
+  auto next = now + milliseconds(1000);
+  profiler.performRunLoopStep(next, next);
+  profiler.performRunLoopStep(next, next);
+  profiler.performRunLoopStep(next, next);
+
+  EXPECT_FALSE(profiler.isActive());
+}
diff --git a/tb_plugins/profiling/libkineto/test/CuptiCallbackApiTest.cpp b/tb_plugins/profiling/libkineto/test/CuptiCallbackApiTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..253b696da54d1919e9c0076c5691a11e35345686
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/test/CuptiCallbackApiTest.cpp
@@ -0,0 +1,239 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
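BufferSizeLimitTestWarmup above exhausts the GPU buffer budget during warmup, after which the run loop winds tracing down early. A sketch of the kind of accounting such a cap implies; the struct, member names, and 1 MB chunk size are assumptions for illustration, not kineto's actual implementation:

```cpp
#include <cstddef>
#include <cstdint>

struct GpuBufferBudget {
  size_t allocatedMB = 0;
  size_t maxGpuBufferMB = 3;  // ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB in the test

  // Hand out fixed-size chunks until the configured cap is reached.
  // The caller owns the returned buffer.
  bool request(uint8_t** buf, size_t* size) {
    constexpr size_t kChunkMB = 1;  // assumed chunk size
    if (allocatedMB + kChunkMB > maxGpuBufferMB) {
      *buf = nullptr;
      *size = 0;
      return false;  // budget spent; the profiler should stop collecting
    }
    allocatedMB += kChunkMB;
    *size = kChunkMB * 1024 * 1024;
    *buf = new uint8_t[*size];
    return true;
  }
};
```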
+ +#include "src/Logger.h" +#include "src/CuptiCallbackApi.h" + +#include +#include +#include +#include + +using namespace std::chrono; +using namespace KINETO_NAMESPACE; +using namespace libkineto; + +const size_t some_data = 42; + +std::atomic simple_cb_calls = 0; + +void simple_cb( + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + const CUpti_CallbackData* cbInfo) { + + // simple arg check + EXPECT_EQ(domain, CUPTI_CB_DOMAIN_RUNTIME_API); + EXPECT_EQ(cbid, CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000); + EXPECT_EQ(*reinterpret_cast(cbInfo), some_data); + + simple_cb_calls++; +} + +void atomic_cb( + CUpti_CallbackDomain /*domain*/, + CUpti_CallbackId /*cbid*/, + const CUpti_CallbackData* /*cbInfo)*/) { + // do some atomics in a loop + for (int i = 0; i < 1000; i++) { + // would have used release consistency but this is fine + simple_cb_calls++; + } +} + +void empty_cb( + CUpti_CallbackDomain /*domain*/, + CUpti_CallbackId /*cbid*/, + const CUpti_CallbackData* /*cbInfo*/) { +} + +TEST(CuptiCallbackApiTest, SimpleTest) { + auto& api = CuptiCallbackApi::singleton(); + + auto addSimpleCallback = [&]() -> bool { + bool ret = api.registerCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CuptiCallbackApi::CUDA_LAUNCH_KERNEL, + &simple_cb + ); + return ret; + }; + EXPECT_TRUE(addSimpleCallback()) << "Failed to add callback"; + + // duplicate add should be okay + EXPECT_TRUE(addSimpleCallback()) << "Failed to re-add callback"; + + simple_cb_calls = 0; + + // simulate callback + api.__callback_switchboard( + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, + reinterpret_cast(&some_data)); + + EXPECT_EQ(simple_cb_calls, 1); + + bool ret = api.deleteCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CuptiCallbackApi::CUDA_LAUNCH_KERNEL, + &simple_cb + ); + + EXPECT_TRUE(ret) << "Failed to remove callback"; + + ret = api.deleteCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CuptiCallbackApi::CUDA_LAUNCH_KERNEL, + &atomic_cb + ); + + EXPECT_FALSE(ret) << "oops! 
deleted a callback that was never added";
+}
+
+TEST(CuptiCallbackApiTest, AllCallbacks) {
+  auto& api = CuptiCallbackApi::singleton();
+
+  auto testCallback = [&](
+      CUpti_CallbackDomain domain,
+      CUpti_CallbackId cbid,
+      CuptiCallbackApi::CuptiCallBackID kineto_cbid) -> bool {
+
+    bool ret = api.registerCallback(domain, kineto_cbid, atomic_cb);
+    EXPECT_TRUE(ret) << "Failed to add callback";
+
+    if (!ret) {
+      return false;
+    }
+
+    simple_cb_calls = 0;
+    api.__callback_switchboard(domain, cbid, nullptr);
+    EXPECT_EQ(simple_cb_calls, 1000);
+    ret = simple_cb_calls == 1000;
+
+    EXPECT_TRUE(api.deleteCallback(domain, kineto_cbid, atomic_cb));
+
+    return ret;
+  };
+
+  EXPECT_TRUE(
+      testCallback(
+          CUPTI_CB_DOMAIN_RESOURCE,
+          CUPTI_CBID_RESOURCE_CONTEXT_CREATED,
+          CuptiCallbackApi::RESOURCE_CONTEXT_CREATED))
+      << "Failed to run callback for RESOURCE_CONTEXT_CREATED";
+
+  EXPECT_TRUE(
+      testCallback(
+          CUPTI_CB_DOMAIN_RESOURCE,
+          CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING,
+          CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED))
+      << "Failed to run callback for RESOURCE_CONTEXT_DESTROYED";
+
+  EXPECT_TRUE(
+      testCallback(
+          CUPTI_CB_DOMAIN_RUNTIME_API,
+          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000,
+          CuptiCallbackApi::CUDA_LAUNCH_KERNEL))
+      << "Failed to run callback for CUDA_LAUNCH_KERNEL";
+}
+
+TEST(CuptiCallbackApiTest, ContentionTest) {
+  auto& api = CuptiCallbackApi::singleton();
+  const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RUNTIME_API;
+  const CUpti_CallbackId cbid = CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000;
+  const CuptiCallbackApi::CuptiCallBackID kineto_cbid =
+      CuptiCallbackApi::CUDA_LAUNCH_KERNEL;
+
+  bool ret = api.registerCallback(domain, kineto_cbid, empty_cb);
+  EXPECT_TRUE(ret) << "Failed to add callback";
+
+  const int iters = 10000;
+  const int num_readers = 8;
+
+  simple_cb_calls = 0;
+
+  // Simulate callbacks being executed on multiple threads in parallel,
+  // and register a new atomic_cb callback during that interval.
+  // This test ensures mutual exclusion is working correctly.
+  auto read_fn = [&](int tid) {
+    auto start_ts = high_resolution_clock::now();
+    for (int i = 0; i < iters; i++) {
+      api.__callback_switchboard(domain, cbid, nullptr);
+    }
+    auto runtime_ms = duration_cast<milliseconds>(
+        high_resolution_clock::now() - start_ts);
+    LOG(INFO) << "th " << tid << " done in " << runtime_ms.count() << " ms";
+  };
+
+  std::vector<std::thread> read_ths;
+  for (int i = 0; i < num_readers; i++) {
+    read_ths.emplace_back(read_fn, i);
+  }
+
+  ret = api.registerCallback(domain, kineto_cbid, atomic_cb);
+  EXPECT_TRUE(ret) << "Failed to add callback";
+
+  for (auto& t : read_ths) {
+    t.join();
+  }
+
+  //EXPECT_GT(simple_cb_calls, 0)
+  //    << "Atomic callback should have been called at least once.";
+
+  api.deleteCallback(domain, kineto_cbid, empty_cb);
+  api.deleteCallback(domain, kineto_cbid, atomic_cb);
+}
+
+TEST(CuptiCallbackApiTest, Benchmark) {
+
+  constexpr int iters = 1000;
+  // run the callback directly a number of times to get a baseline
+
+  const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RUNTIME_API;
+  const CUpti_CallbackId cbid = CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000;
+  const CuptiCallbackApi::CuptiCallBackID kineto_cbid =
+      CuptiCallbackApi::CUDA_LAUNCH_KERNEL;
+
+  LOG(INFO) << "Iteration count = " << iters;
+
+  const bool use_empty = true;
+  auto cbfn = use_empty ?
&empty_cb : &atomic_cb; + + // warmup + for (int i = 0; i < 50; i++) { + (*cbfn)(domain, cbid, nullptr); + } + + auto start_ts = high_resolution_clock::now(); + for (int i = 0; i < iters; i++) { + (*cbfn)(domain, cbid, nullptr); + } + auto delta_baseline_ns = duration_cast( + high_resolution_clock::now() - start_ts); + LOG(INFO) << "Baseline runtime = " << delta_baseline_ns.count() << " ns"; + + + auto& api = CuptiCallbackApi::singleton(); + bool ret = api.registerCallback(domain, kineto_cbid, cbfn); + EXPECT_TRUE(ret) << "Failed to add callback"; + + // warmup + for (int i = 0; i < 50; i++) { + api.__callback_switchboard(domain, cbid, nullptr); + } + + start_ts = high_resolution_clock::now(); + for (int i = 0; i < iters; i++) { + api.__callback_switchboard(domain, cbid, nullptr); + } + + auto delta_callback_ns = duration_cast( + high_resolution_clock::now() - start_ts); + LOG(INFO) << "Callback runtime = " << delta_callback_ns.count() << " ns"; + + LOG(INFO) << "Callback runtime per iteration = " << + (delta_callback_ns.count() - delta_baseline_ns.count()) / (double) iters + << " ns"; + +} diff --git a/tb_plugins/profiling/libkineto/test/CuptiProfilerApiTest.cu b/tb_plugins/profiling/libkineto/test/CuptiProfilerApiTest.cu new file mode 100644 index 0000000000000000000000000000000000000000..54ad51b0a1fc9a6a54585d1cad4674943c874b98 --- /dev/null +++ b/tb_plugins/profiling/libkineto/test/CuptiProfilerApiTest.cu @@ -0,0 +1,353 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include +#include +#include + +#include + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "src/Logger.h" +#include "src/CuptiRangeProfilerApi.h" + +#define DRIVER_API_CALL(apiFuncCall) \ + do { \ + CUresult _status = apiFuncCall; \ + if (_status != CUDA_SUCCESS) { \ + LOG(ERROR) << "Failed invoking CUDA driver function " \ + << #apiFuncCall << " status = " \ + << _status; \ + exit(-1); \ + } \ + } while (0) + +#define EXPECT(expr)\ + if (!(expr)) {\ + }; + +using namespace KINETO_NAMESPACE; + +static int numRanges = 1; + +using Type = double; + +// Device code +__global__ void VecAdd(const Type* A, const Type* B, Type* C, int N) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < N) { + C[i] = A[i] + B[i]; + } +} + +// Device code +__global__ void VecSub(const Type* A, const Type* B, Type* C, int N) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < N) { + C[i] = A[i] - B[i]; + } +} + +static void initVec(Type* vec, int n) { + for (int i = 0; i < n; i++) { + vec[i] = i; + } +} + +static void cleanUp( + Type* h_A, + Type* h_B, + Type* h_C, + Type* h_D, + Type* d_A, + Type* d_B, + Type* d_C, + Type* d_D) { + if (d_A) + cudaFree(d_A); + if (d_B) + cudaFree(d_B); + if (d_C) + cudaFree(d_C); + if (d_D) + cudaFree(d_D); + + // Free host memory + if (h_A) + free(h_A); + if (h_B) + free(h_B); + if (h_C) + free(h_C); + if (h_D) + free(h_D); +} + +/* Benchmark application used to test profiler measurements + * This simply runs two kernels vector Add and Vector Subtract + */ + +void VectorAddSubtract() { + int N = 50000; + size_t size = N * sizeof(Type); + int threadsPerBlock = 0; + int blocksPerGrid = 0; + Type *h_A, *h_B, *h_C, *h_D; + Type *d_A, *d_B, *d_C, *d_D; + int i; + Type sum, diff; + + // Allocate input vectors h_A and h_B in host memory + h_A = (Type*)malloc(size); + h_B = (Type*)malloc(size); + h_C = (Type*)malloc(size); + h_D = (Type*)malloc(size); + + // Initialize input vectors + initVec(h_A, N); + initVec(h_B, N); + 
memset(h_C, 0, size); + memset(h_D, 0, size); + + // Allocate vectors in device memory + cudaMalloc((void**)&d_A, size); + cudaMalloc((void**)&d_B, size); + cudaMalloc((void**)&d_C, size); + cudaMalloc((void**)&d_D, size); + + // Copy vectors from host memory to device memory + cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); + + // Invoke kernel + threadsPerBlock = 256; + blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; + LOG(INFO) << fmt::format( + "Launching kernel: blocks {}, thread/block {}", + blocksPerGrid, + threadsPerBlock); + + VecAdd<<>>(d_A, d_B, d_C, N); + + VecSub<<>>(d_A, d_B, d_D, N); + + // Copy result from device memory to host memory + // h_C contains the result in host memory + cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); + cudaMemcpy(h_D, d_D, size, cudaMemcpyDeviceToHost); + + // Verify result + for (i = 0; i < N; ++i) { + sum = h_A[i] + h_B[i]; + diff = h_A[i] - h_B[i]; + if (h_C[i] != sum || h_D[i] != diff) { + LOG(ERROR) << "Result verification failed"; + break; + } + } + + cleanUp(h_A, h_B, h_C, h_D, d_A, d_B, d_C, d_D); +} + +#if HAS_CUPTI_RANGE_PROFILER +bool runTestWithAutoRange( + int deviceNum, + const std::vector& metricNames, + CUcontext cuContext, + bool async) { + + // create a CUPTI range based profiling profiler + // this configures the counter data as well + CuptiRBProfilerSession profiler( + metricNames, deviceNum, 2, 1, async ? nullptr : cuContext); + + CUpti_ProfilerRange profilerRange = CUPTI_AutoRange; + CUpti_ProfilerReplayMode profilerReplayMode = CUPTI_KernelReplay; + + if (async) { + profiler.asyncStartAndEnable(profilerRange, profilerReplayMode); + } else { + profiler.start(profilerRange, profilerReplayMode); + profiler.enable(); + } + + VectorAddSubtract(); + + if (!async) { + profiler.disable(); + // stop profiler + profiler.stop(); + } else { + profiler.asyncDisableAndStop(); + } + + auto result = profiler.evaluateMetrics(true); + + // check results + EXPECT_EQ(result.metricNames.size(), 3); + EXPECT_EQ(result.rangeVals.size(), 2); + + for (const auto& measurement : result.rangeVals) { + EXPECT_EQ(measurement.values.size(), 3); + + if (measurement.values.size() == 3) { + // smsp__warps_launched.avg + EXPECT_NE(measurement.values[0], 0); + // smsp__sass_thread_inst_executed_op_dadd_pred_on.sum + // each kernel has 50000 dadd ops + EXPECT_EQ(measurement.values[1], 50000); + // sm__inst_executed_pipe_tensor.sum + //EXPECT_EQ(measurement.values[2], 0); + } + } + return true; +} + +bool runTestWithUserRange( + int deviceNum, + const std::vector& metricNames, + CUcontext cuContext, + bool async = false) { + + // create a CUPTI range based profiling profiler + // this configures the counter data as well + CuptiRBProfilerSession profiler( + metricNames, deviceNum, numRanges, 1, async ? 
nullptr : cuContext);
+
+  CUpti_ProfilerRange profilerRange = CUPTI_UserRange;
+  CUpti_ProfilerReplayMode profilerReplayMode = CUPTI_UserReplay;
+
+  if (async) {
+    profiler.asyncStartAndEnable(profilerRange, profilerReplayMode);
+    { VectorAddSubtract(); }
+    profiler.disableAndStop();
+  } else {
+    profiler.start(profilerRange, profilerReplayMode);
+
+    /* User takes the responsibility of replaying the kernel launches */
+    bool replay = true;
+    do {
+      profiler.beginPass();
+      {
+        profiler.enable();
+
+        std::string rangeName = "vecAddSub";
+        profiler.pushRange(rangeName);
+
+        { VectorAddSubtract(); }
+
+        profiler.popRange();
+        profiler.disable();
+      }
+      LOG(INFO) << "Replay starting.";
+      replay = profiler.endPass();
+
+    } while (!replay);
+
+    // stop profiler
+    profiler.stop();
+  }
+  VectorAddSubtract();
+  auto result = profiler.evaluateMetrics(true);
+
+  // check results
+  EXPECT_EQ(result.metricNames.size(), 3);
+  EXPECT_EQ(result.rangeVals.size(), 1);
+
+  if (result.rangeVals.size() > 0) {
+    const auto& measurement = result.rangeVals[0];
+    EXPECT_EQ(measurement.values.size(), 3);
+
+    if (measurement.values.size() == 3) {
+      // smsp__warps_launched.avg
+      EXPECT_NE(measurement.values[0], 0);
+      // smsp__sass_thread_inst_executed_op_dadd_pred_on.sum
+      // in async mode multiple passes are not supported yet
+      if (!async) {
+        EXPECT_EQ(measurement.values[1], 100000);
+      }
+      // sm__inst_executed_pipe_tensor.sum
+      //EXPECT_EQ(measurement.values[2], 0);
+    }
+  }
+  return true;
+}
+#endif // HAS_CUPTI_RANGE_PROFILER
+
+int main(int argc, char* argv[]) {
+
+  CUdevice cuDevice;
+
+  int deviceCount, deviceNum;
+  int computeCapabilityMajor = 0, computeCapabilityMinor = 0;
+
+  printf("Usage: %s [device_num]\n", argv[0]);
+
+  DRIVER_API_CALL(cuInit(0));
+  DRIVER_API_CALL(cuDeviceGetCount(&deviceCount));
+
+  if (deviceCount == 0) {
+    LOG(ERROR) << "There is no device supporting CUDA.";
+    return -2;
+  }
+
+  if (argc > 1)
+    deviceNum = atoi(argv[1]);
+  else
+    deviceNum = 0;
+  LOG(INFO) << "CUDA Device Number: " << deviceNum;
+
+  DRIVER_API_CALL(cuDeviceGet(&cuDevice, deviceNum));
+  DRIVER_API_CALL(cuDeviceGetAttribute(
+      &computeCapabilityMajor,
+      CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+      cuDevice));
+  DRIVER_API_CALL(cuDeviceGetAttribute(
+      &computeCapabilityMinor,
+      CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+      cuDevice));
+
+  LOG(INFO) << "Compute Capability = "
+            << fmt::format("{},{}", computeCapabilityMajor, computeCapabilityMinor);
+
+  if (computeCapabilityMajor < 7) {
+    LOG(ERROR) << "CUPTI Profiler is not supported with compute capability < 7.0";
+    return -2;
+  }
+
+  CuptiRBProfilerSession::staticInit();
+
+  // metrics to profile
+  std::vector<std::string> metricNames = {
+    "smsp__warps_launched.avg",
+    "smsp__sass_thread_inst_executed_op_dadd_pred_on.sum",
+    "sm__inst_executed_pipe_tensor.sum",
+  };
+
+  CUcontext cuContext;
+  DRIVER_API_CALL(cuCtxCreate(&cuContext, 0, cuDevice));
+
+  VectorAddSubtract();
+
+#if HAS_CUPTI_RANGE_PROFILER
+  CuptiRBProfilerSession::staticInit();
+
+  if (!runTestWithUserRange(deviceNum, metricNames, cuContext, false)) {
+    LOG(ERROR) << "Failed to run profiler test benchmark in user range";
+  } else if (!runTestWithAutoRange(deviceNum, metricNames, cuContext, false)) {
+    LOG(ERROR) << "Failed to run profiler test benchmark in auto range";
+  } else if (!runTestWithUserRange(deviceNum, metricNames, cuContext, true)) {
+    LOG(ERROR) << "Failed to run profiler test benchmark in user range async";
+  } else if (!runTestWithAutoRange(deviceNum, metricNames, cuContext, true)) {
+    LOG(ERROR) << "Failed to run
profiler test benchmark in auto range async"; + } + + CuptiRBProfilerSession::deInitCupti(); +#else + LOG(WARNING) << "CuptiRBProfilerSession is not supported."; +#endif // HAS_CUPTI_RANGE_PROFILER + DRIVER_API_CALL(cuCtxDestroy(cuContext)); + + + return 0; +} diff --git a/tb_plugins/profiling/libkineto/test/CuptiRangeProfilerApiTest.cpp b/tb_plugins/profiling/libkineto/test/CuptiRangeProfilerApiTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..28cad722c53ee5defaa7c24cbe0d6b2cbc840a30 --- /dev/null +++ b/tb_plugins/profiling/libkineto/test/CuptiRangeProfilerApiTest.cpp @@ -0,0 +1,113 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include +#include +#include + +#include "include/libkineto.h" +#include "include/Config.h" +#include "src/CuptiRangeProfilerApi.h" + +#include "src/Logger.h" +#include "test/CuptiRangeProfilerTestUtil.h" + +using namespace KINETO_NAMESPACE; + +#if HAS_CUPTI_PROFILER + +TEST(CuptiRangeProfilerApiTest, contextTracking) { + std::vector log_modules( + {"CuptiRangeProfilerApi.cpp"}); + SET_LOG_VERBOSITY_LEVEL(1, log_modules); + + std::array data; + std::array contexts; + for (int i = 0; i < data.size(); i++) { + contexts[i] = reinterpret_cast(&data[i]); + } + + // simulate creating contexts, this calls the trackCudaContexts + // function that would otherwise be called via a callback + uint32_t dev = 0; + for (auto ctx : contexts) { + simulateCudaContextCreate(ctx, dev++); + } + + EXPECT_EQ( + CuptiRBProfilerSession::getActiveDevices(), + std::set({0, 1, 2})); + + simulateCudaContextDestroy(contexts[1], 1); + + EXPECT_EQ( + CuptiRBProfilerSession::getActiveDevices(), + std::set({0, 2})); + + simulateCudaContextDestroy(contexts[0], 0); + simulateCudaContextDestroy(contexts[2], 2); + + EXPECT_TRUE( + CuptiRBProfilerSession::getActiveDevices().empty()); +} + +TEST(CuptiRangeProfilerApiTest, asyncLaunchUserRange) { + std::vector log_modules( + {"CuptiRangeProfilerApi.cpp"}); + SET_LOG_VERBOSITY_LEVEL(1, log_modules); + + // this is bad but the pointer is never accessed + CUcontext ctx0 = reinterpret_cast(10); + simulateCudaContextCreate(ctx0, 0 /*device_id*/); + + auto session = std::make_unique(0, ctx0); + session->asyncStartAndEnable(CUPTI_UserRange, CUPTI_UserReplay); + + simulateKernelLaunch(ctx0, "hello"); + simulateKernelLaunch(ctx0, "foo"); + simulateKernelLaunch(ctx0, "bar"); + + session->asyncDisableAndStop(); + // stop happens after next kernel is run + simulateKernelLaunch(ctx0, "bar"); + simulateCudaContextDestroy(ctx0, 0 /*device_id*/); + + EXPECT_EQ(session->passes_ended, 1); + EXPECT_EQ(session->ranges_ended, 1); + EXPECT_TRUE(session->enabled); +} + +TEST(CuptiRangeProfilerApiTest, asyncLaunchAutoRange) { + std::vector log_modules( + {"CuptiRangeProfilerApi.cpp"}); + SET_LOG_VERBOSITY_LEVEL(1, log_modules); + + // this is bad but the pointer is never accessed + CUcontext ctx0 = reinterpret_cast(10); + CUcontext ctx1 = reinterpret_cast(11); + + simulateCudaContextCreate(ctx0, 0 /*device_id*/); + + auto session = std::make_unique(0, ctx0); + session->asyncStartAndEnable(CUPTI_AutoRange, CUPTI_KernelReplay); + + simulateKernelLaunch(ctx0, "hello"); + simulateKernelLaunch(ctx0, "foo"); + simulateKernelLaunch(ctx1, "kernel_on_different_device"); + simulateKernelLaunch(ctx0, "bar"); + + session->asyncDisableAndStop(); + // stop happens after next kernel is run + simulateKernelLaunch(ctx0, "bar"); + simulateCudaContextDestroy(ctx0, 0 /*device_id*/); + + EXPECT_EQ(session->passes_ended, 0); + 
EXPECT_EQ(session->ranges_ended, 0); + EXPECT_TRUE(session->enabled); + + EXPECT_EQ( + session->getKernelNames(), + std::vector({"hello", "foo", "bar"})) + << "Kernel names were not tracked"; +} + +#endif // HAS_CUPTI_PROFILER diff --git a/tb_plugins/profiling/libkineto/test/CuptiRangeProfilerConfigTest.cpp b/tb_plugins/profiling/libkineto/test/CuptiRangeProfilerConfigTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f568968238a0e376ab3bae621af00a162af0d25 --- /dev/null +++ b/tb_plugins/profiling/libkineto/test/CuptiRangeProfilerConfigTest.cpp @@ -0,0 +1,67 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "include/Config.h" +#include "src/CuptiRangeProfilerConfig.h" + +#include +#include +#include +#include + +using namespace std::chrono; +using namespace KINETO_NAMESPACE; + +class CuptiRangeProfilerConfigTest : public ::testing::Test { + protected: + void SetUp() override { + CuptiRangeProfilerConfig::registerFactory(); + } +}; + +TEST_F(CuptiRangeProfilerConfigTest, ConfigureProfiler) { + Config cfg; + std::vector metrics = { + "kineto__cuda_core_flops", + "sm__inst_executed.sum", + "l1tex__data_bank_conflicts_pipe_lsu.sum", + }; + auto metricsConfigStr = + fmt::format("CUPTI_PROFILER_METRICS = {}", fmt::join(metrics, ",")); + + EXPECT_TRUE(cfg.parse(metricsConfigStr)); + EXPECT_TRUE(cfg.parse("CUPTI_PROFILER_ENABLE_PER_KERNEL = true")); + EXPECT_TRUE(cfg.parse("CUPTI_PROFILER_MAX_RANGES = 42")); + + const CuptiRangeProfilerConfig& cupti_cfg = + CuptiRangeProfilerConfig::get(cfg); + + EXPECT_EQ(cupti_cfg.activitiesCuptiMetrics(), metrics); + EXPECT_EQ(cupti_cfg.cuptiProfilerPerKernel(), true); + EXPECT_EQ(cupti_cfg.cuptiProfilerMaxRanges(), 42); + +} + +TEST_F(CuptiRangeProfilerConfigTest, RangesDefaults) { + Config cfg, cfg_auto; + + // do not set max ranges in config, check defaults are sane + EXPECT_TRUE(cfg.parse("CUPTI_PROFILER_METRICS = kineto__cuda_core_flops")); + EXPECT_TRUE(cfg.parse("CUPTI_PROFILER_ENABLE_PER_KERNEL = false")); + + cfg.setSignalDefaults(); + + EXPECT_TRUE(cfg_auto.parse("CUPTI_PROFILER_METRICS = kineto__cuda_core_flops")); + EXPECT_TRUE(cfg_auto.parse("CUPTI_PROFILER_ENABLE_PER_KERNEL = true")); + + cfg_auto.setClientDefaults(); + + int user_ranges, auto_ranges; + + user_ranges = CuptiRangeProfilerConfig::get(cfg).cuptiProfilerMaxRanges(); + auto_ranges = CuptiRangeProfilerConfig::get(cfg_auto).cuptiProfilerMaxRanges(); + + EXPECT_GE(user_ranges, 1) << " in user range mode default to at least 1 ranges"; + EXPECT_GE(auto_ranges, 1000) << " in auto range mode default to at least 1000 ranges"; + + EXPECT_GT(auto_ranges, user_ranges); +} diff --git a/tb_plugins/profiling/libkineto/test/CuptiRangeProfilerTestUtil.h b/tb_plugins/profiling/libkineto/test/CuptiRangeProfilerTestUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..861b65fd701bf69373df657ab2a22d9dba0b27df --- /dev/null +++ b/tb_plugins/profiling/libkineto/test/CuptiRangeProfilerTestUtil.h @@ -0,0 +1,96 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
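The RangesDefaults test above only pins lower bounds: at least 1 range in user-range mode, at least 1000 in auto-range (per-kernel) mode, with the auto default strictly larger. A defaulting rule consistent with those assertions; the function name and constants are assumptions for illustration, not the values hard-coded in CuptiRangeProfilerConfig:

```cpp
// Choose a max-ranges default based on the profiling mode.
int defaultMaxRanges(bool perKernelAutoRange) {
  constexpr int kUserRangeDefault = 10;    // assumed; test only requires >= 1
  constexpr int kAutoRangeDefault = 1000;  // assumed; test requires >= 1000
  return perKernelAutoRange ? kAutoRangeDefault : kUserRangeDefault;
}
```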
+ +#include +#include + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "CuptiRangeProfilerApi.h" + +namespace KINETO_NAMESPACE { + +#if HAS_CUPTI_PROFILER + +class MockCuptiRBProfilerSession : public CuptiRBProfilerSession { + public: + MockCuptiRBProfilerSession(int deviceId, CUcontext ctx) + : CuptiRBProfilerSession(deviceId, ctx) {} + + void beginPass() override { + LOG(INFO) << " Mock CUPTI begin pass"; + passes_started++; + } + + bool endPass() override { + passes_ended++; + return true; + } + + void flushCounterData() override {} + + void pushRange(const std::string& rangeName) override { + LOG(INFO) << " Mock CUPTI pushrange ( " << rangeName << " )"; + ranges_started++; + } + + void popRange() override { + LOG(INFO) << " Mock CUPTI poprange"; + ranges_ended++; + } + + void stop() override { + runChecks(); + } + + void enable() override { + enabled = true; + } + void disable() override {} + + CuptiProfilerResult evaluateMetrics(bool /*verbose*/) override { + return result; + } + +protected: + void startInternal( + CUpti_ProfilerRange profilerRange, + CUpti_ProfilerReplayMode profilerReplayMode) override { + curRange_ = profilerRange; + curReplay_ = profilerReplayMode; + } + +private: + void runChecks() { + EXPECT_EQ(passes_started, passes_ended); + EXPECT_EQ(ranges_started, ranges_ended); + } + + public: + int passes_started = 0; + int passes_ended = 0; + int ranges_started = 0; + int ranges_ended = 0; + bool enabled = false; + + CuptiProfilerResult result; + +}; + +inline void simulateCudaContextCreate(CUcontext context, uint32_t dev) { + testing::trackCudaCtx( + context, dev, CUPTI_CBID_RESOURCE_CONTEXT_CREATED); +} + +inline void simulateCudaContextDestroy(CUcontext context, uint32_t dev) { + testing::trackCudaCtx( + context, dev, CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING); +} + +inline void simulateKernelLaunch( + CUcontext context, const std::string& kernelName) { + testing::trackCudaKernelLaunch(context, kernelName.c_str()); +} + +#endif // HAS_CUPTI_PROFILER + +} // namespace KINETO_NAMESPACE diff --git a/tb_plugins/profiling/libkineto/test/CuptiStringsTest.cpp b/tb_plugins/profiling/libkineto/test/CuptiStringsTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..405f9404a49a5bf8b7433930b0ad2fe898ea2d89 --- /dev/null +++ b/tb_plugins/profiling/libkineto/test/CuptiStringsTest.cpp @@ -0,0 +1,29 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include + +#include "src/cupti_strings.h" + +using namespace KINETO_NAMESPACE; + +TEST(CuptiStringsTest, Valid) { + ASSERT_STREQ( + runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_INVALID), "INVALID"); + ASSERT_STREQ( + runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_cudaDriverGetVersion_v3020), + "cudaDriverGetVersion"); + ASSERT_STREQ(runtimeCbidName + (CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020), + "cudaDeviceSynchronize"); + ASSERT_STREQ( + runtimeCbidName(CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000), + "cudaStreamSetAttribute_ptsz"); +} + +TEST(CuptiStringsTest, Invalid) { + ASSERT_STREQ(runtimeCbidName(-1), "INVALID"); + // We can't actually use CUPTI_RUNTIME_TRACE_CBID_SIZE here until we + // auto-generate the string table, since it may have more entries than + // the enum in the version used to compile. 
+  ASSERT_STREQ(runtimeCbidName(1000), "INVALID");
+}
diff --git a/tb_plugins/profiling/libkineto/test/EventProfilerTest.cpp b/tb_plugins/profiling/libkineto/test/EventProfilerTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cb36c826a7f32b2fe6732e73eae3b6a006b0cd3d
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/test/EventProfilerTest.cpp
@@ -0,0 +1,578 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include "src/EventProfiler.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <chrono>
+
+using namespace std::chrono;
+using namespace KINETO_NAMESPACE;
+
+TEST(PercentileTest, Create) {
+  PercentileList pct = {{10, SampleValue(0)},
+                        {49, SampleValue(0)},
+                        {50, SampleValue(0)},
+                        {90, SampleValue(0)}};
+
+  percentiles({0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100}, pct);
+  EXPECT_EQ(pct[0].second.getInt(), 10);
+  EXPECT_EQ(pct[1].second.getInt(), 50);
+  EXPECT_EQ(pct[2].second.getInt(), 50);
+  EXPECT_EQ(pct[3].second.getInt(), 90);
+
+  percentiles({80, 10, 20, 70, 60, 40, 90, 30, 50, 0, 100}, pct);
+  EXPECT_EQ(pct[0].second.getInt(), 10);
+  EXPECT_EQ(pct[1].second.getInt(), 50);
+  EXPECT_EQ(pct[2].second.getInt(), 50);
+  EXPECT_EQ(pct[3].second.getInt(), 90);
+
+  percentiles({80}, pct);
+  EXPECT_EQ(pct[0].second.getInt(), 80);
+  EXPECT_EQ(pct[1].second.getInt(), 80);
+  EXPECT_EQ(pct[2].second.getInt(), 80);
+  EXPECT_EQ(pct[3].second.getInt(), 80);
+
+  percentiles({80, 50}, pct);
+  EXPECT_EQ(pct[0].second.getInt(), 50);
+  EXPECT_EQ(pct[1].second.getInt(), 50);
+  EXPECT_EQ(pct[2].second.getInt(), 80);
+  EXPECT_EQ(pct[3].second.getInt(), 80);
+}
+
+TEST(PercentileTest, Normalize) {
+  PercentileList pct = {
+      {10, SampleValue(10)}, {50, SampleValue(100.0)}, {90, SampleValue(2000)}};
+
+  normalize(pct, 2.5);
+
+  EXPECT_EQ(pct[0].second.getInt(), 25);
+  EXPECT_EQ((int)pct[1].second.getDouble(), 250);
+  EXPECT_EQ(pct[2].second.getInt(), 5000);
+}
+
+TEST(EventTest, SumSamples) {
+  Event ev;
+  ev.instanceCount = 4;
+  auto t = system_clock::now();
+  ev.addSample(t, {1, 2, 3, 4});
+  ev.addSample(t, {10, 20, 30, 40});
+  ev.addSample(t, {100, 200, 300, 400});
+
+  EXPECT_EQ(ev.sumInstance(0, {0, 0, 3}), 1);
+  EXPECT_EQ(ev.sumInstance(0, {0, 1, 3}), 10);
+  EXPECT_EQ(ev.sumInstance(0, {0, 2, 3}), 100);
+
+  EXPECT_EQ(ev.sumInstance(0, {0, 0, 1}), 111);
+
+  EXPECT_EQ(ev.sumInstance(3, {0, 0, 1}), 444);
+
+  // Non-zero offset
+  EXPECT_EQ(ev.sumInstance(0, {1, 0, 2}), 10);
+  EXPECT_EQ(ev.sumInstance(0, {1, 1, 2}), 100);
+  EXPECT_EQ(ev.sumInstance(0, {1, 0, 1}), 110);
+
+  ev.addSample(t, {1000, 2000, 3000, 4000});
+
+  EXPECT_EQ(ev.sumInstance(0, {1, 0, 3}), 10);
+  EXPECT_EQ(ev.sumInstance(0, {1, 1, 3}), 100);
+  EXPECT_EQ(ev.sumInstance(0, {2, 1, 2}), 1000);
+  EXPECT_EQ(ev.sumInstance(0, {2, 0, 1}), 1100);
+
+  EXPECT_EQ(ev.sumAll({0, 0, 4}), 10);
+  EXPECT_EQ(ev.sumAll({1, 0, 3}), 100);
+  EXPECT_EQ(ev.sumAll({2, 1, 2}), 10000);
+  EXPECT_EQ(ev.sumAll({0, 1, 2}), 11000);
+  EXPECT_EQ(ev.sumAll({0, 0, 1}), 11110);
+}
+
+TEST(EventTest, Percentiles) {
+  Event ev;
+  ev.instanceCount = 4;
+  auto t = system_clock::now();
+  ev.addSample(t, {3, 2, 1, 4});
+  ev.addSample(t, {30, 20, 10, 40});
+  ev.addSample(t, {300, 200, 100, 400});
+
+  PercentileList pct = {
+      {10, SampleValue(0)}, {50, SampleValue(0)}, {90, SampleValue(0)}};
+
+  ev.percentiles(pct, {0, 0, 3});
+  EXPECT_EQ(pct[0].second.getInt(), 1);
+  EXPECT_EQ(pct[1].second.getInt(), 3);
+  EXPECT_EQ(pct[2].second.getInt(), 4);
+
+  ev.percentiles(pct, {0, 0, 1});
+  EXPECT_EQ(pct[0].second.getInt(), 111);
+  EXPECT_EQ(pct[1].second.getInt(), 333);
+  EXPECT_EQ(pct[2].second.getInt(), 444);
+}
+
+class MockCuptiMetrics : public CuptiMetricApi {
+ public:
+  MockCuptiMetrics() : CuptiMetricApi(0) {}
+  MOCK_METHOD1(idFromName, CUpti_MetricID(const std::string& name));
+  MOCK_METHOD1(
+      events,
+      std::map<CUpti_EventID, std::string>(CUpti_MetricID metric_id));
+  MOCK_METHOD1(valueKind, CUpti_MetricValueKind(CUpti_MetricID metric));
+  MOCK_METHOD1(
+      evaluationMode,
+      CUpti_MetricEvaluationMode(CUpti_MetricID metric));
+  MOCK_METHOD5(
+      calculate,
+      SampleValue(
+          CUpti_MetricID metric,
+          CUpti_MetricValueKind kind,
+          std::vector<CUpti_EventID>& events,
+          std::vector<int64_t>& values,
+          int64_t duration));
+};
+
+TEST(MetricTest, Calculate) {
+  using ::testing::Return;
+  MockCuptiMetrics metrics;
+
+  // The events used for the ipc metric: instructions and cycles.
+  // Pretend we have 2 SMs and 2 samples of each event.
+  Event instr("instructions");
+  instr.instanceCount = 2;
+  auto t = system_clock::now();
+  instr.addSample(t, {100, 200});
+  instr.addSample(t, {300, 400});
+
+  Event cycles("cycles");
+  cycles.instanceCount = 2;
+  cycles.addSample(t, {1000, 1200});
+  cycles.addSample(t, {1300, 1300});
+
+  // 2 & 3 are the event ids we specified in the metric
+  std::map<CUpti_EventID, Event> events;
+  events[2] = std::move(instr);
+  events[3] = std::move(cycles);
+
+  // Define an ipc metric
+  EXPECT_CALL(metrics, valueKind(1))
+      .Times(1)
+      .WillOnce(Return(CUPTI_METRIC_VALUE_KIND_DOUBLE));
+  Metric m(
+      "ipc", 1, {2, 3}, CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE, metrics);
+
+  // Calculate the metric for the first sample.
+  // Since the evaluation mode is CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE,
+  // the Cupti API will be called three times: once for each SM (2) and once
+  // to get the total across SMs.
+  std::vector<CUpti_EventID> ids = {2, 3};
+  std::vector<int64_t> vals = {100, 1000};
+  EXPECT_CALL(
+      metrics, calculate(1, CUPTI_METRIC_VALUE_KIND_DOUBLE, ids, vals, 1000))
+      .Times(1)
+      .WillOnce(Return(SampleValue(0.1)));
+  vals = {200, 1200};
+  EXPECT_CALL(
+      metrics, calculate(1, CUPTI_METRIC_VALUE_KIND_DOUBLE, ids, vals, 1000))
+      .Times(1)
+      .WillOnce(Return(SampleValue(0.17)));
+  vals = {300, 2200};
+  EXPECT_CALL(
+      metrics, calculate(1, CUPTI_METRIC_VALUE_KIND_DOUBLE, ids, vals, 1000))
+      .Times(1)
+      .WillOnce(Return(SampleValue(0.14)));
+  auto v = m.calculate(events, nanoseconds(1000), {0, 0, 2});
+
+  EXPECT_EQ(v.perInstance.size(), 2);
+  EXPECT_EQ(v.perInstance[0].getDouble(), 0.1);
+  EXPECT_EQ(v.perInstance[1].getDouble(), 0.17);
+  EXPECT_EQ(v.total.getDouble(), 0.14);
+
+  // Calculate the second sample.
+  // Change the evaluation mode to CUPTI_METRIC_EVALUATION_MODE_AGGREGATE.
+  // Now we should get only one call to the Cupti API, for the total.
+  EXPECT_CALL(metrics, valueKind(1))
+      .Times(1)
+      .WillOnce(Return(CUPTI_METRIC_VALUE_KIND_DOUBLE));
+  Metric m2("ipc", 1, {2, 3}, CUPTI_METRIC_EVALUATION_MODE_AGGREGATE, metrics);
+  vals = {700, 2600};
+  EXPECT_CALL(
+      metrics, calculate(1, CUPTI_METRIC_VALUE_KIND_DOUBLE, ids, vals, 1000))
+      .Times(1)
+      .WillOnce(Return(SampleValue(0.27)));
+  v = m2.calculate(events, nanoseconds(1000), {0, 1, 2});
+
+  EXPECT_EQ(v.perInstance.size(), 1);
+  EXPECT_EQ(v.perInstance[0].getDouble(), 0.27);
+  EXPECT_EQ(v.total.getDouble(), 0.27);
+}
+
+class MockCuptiEvents : public CuptiEventApi {
+ public:
+  MOCK_METHOD1(
+      createGroupSets,
+      CUpti_EventGroupSets*(std::vector<CUpti_EventID>& ids));
+  MOCK_METHOD1(destroyGroupSets, void(CUpti_EventGroupSets* sets));
+  MOCK_METHOD0(setContinuousMode, bool());
+  MOCK_METHOD1(enablePerInstance, void(CUpti_EventGroup eventGroup));
+  MOCK_METHOD1(instanceCount, uint32_t(CUpti_EventGroup eventGroup));
+  MOCK_METHOD1(enableGroupSet, void(CUpti_EventGroupSet& set));
+  MOCK_METHOD1(disableGroupSet, void(CUpti_EventGroupSet& set));
+  MOCK_METHOD3(
+      readEvent,
+      void(CUpti_EventGroup g, CUpti_EventID id, std::vector<int64_t>& vals));
+  MOCK_METHOD1(eventsInGroup, std::vector<CUpti_EventID>(CUpti_EventGroup g));
+  MOCK_METHOD1(eventId, CUpti_EventID(const std::string& name));
+};
+
+TEST(EventGroupSetTest, CollectSample) {
+  using ::testing::_;
+  using ::testing::Return;
+  using ::testing::SetArgPointee;
+  const CUpti_EventGroup g1{nullptr};
+  const CUpti_EventGroup g2{reinterpret_cast<CUpti_EventGroup>(0x1000)};
+  CUpti_EventGroup groups[] = {g1, g2};
+  CUpti_EventGroupSet set;
+  set.eventGroups = groups;
+  set.numEventGroups = 2;
+
+  std::map<CUpti_EventID, Event> events;
+  Event instr("instructions");
+  events[4] = std::move(instr);
+  Event cycles("cycles");
+  events[5] = std::move(cycles);
+  Event branches("branches");
+  events[10] = std::move(branches);
+
+  MockCuptiEvents cupti_events;
+  EXPECT_CALL(cupti_events, enablePerInstance(g1)).Times(1);
+  EXPECT_CALL(cupti_events, enablePerInstance(g2)).Times(1);
+  EXPECT_CALL(cupti_events, instanceCount(g1)).Times(1).WillOnce(Return(80));
+  EXPECT_CALL(cupti_events, instanceCount(g2)).Times(1).WillOnce(Return(40));
+  std::vector<CUpti_EventID> events_in_group1 = {4, 5};
+  EXPECT_CALL(cupti_events, eventsInGroup(g1))
+      .Times(1)
+      .WillOnce(Return(events_in_group1));
+  std::vector<CUpti_EventID> events_in_group2 = {10};
+  EXPECT_CALL(cupti_events, eventsInGroup(g2))
+      .Times(1)
+      .WillOnce(Return(events_in_group2));
+  EventGroupSet group_set(set, events, cupti_events);
+
+  EXPECT_EQ(group_set.groupCount(), 2);
+  EXPECT_EQ(events[4].instanceCount, 80);
+  EXPECT_EQ(events[5].instanceCount, 80);
+  EXPECT_EQ(events[10].instanceCount, 40);
+
+  // This should not cause any Cupti API action as the group
+  // set is already disabled
+  group_set.setEnabled(false);
+
+  // Activate the group set - if activated twice, only the first
+  // call should cause the Cupti API to be called
+  EXPECT_CALL(cupti_events, enableGroupSet(_)).Times(1);
+  group_set.setEnabled(false);
+  group_set.setEnabled(true);
+
+  EXPECT_CALL(cupti_events, eventsInGroup(g1))
+      .Times(1)
+      .WillOnce(Return(events_in_group1));
+  EXPECT_CALL(cupti_events, eventsInGroup(g2))
+      .Times(1)
+      .WillOnce(Return(events_in_group2));
+  EXPECT_CALL(cupti_events, readEvent(g1, 4, _)).Times(1);
+  EXPECT_CALL(cupti_events, readEvent(g1, 5, _)).Times(1);
+  EXPECT_CALL(cupti_events, readEvent(g2, 10, _)).Times(1);
+  group_set.collectSample();
+
+  EXPECT_EQ(events[4].sampleCount(), 1);
+  EXPECT_EQ(events[5].sampleCount(), 1);
+  EXPECT_EQ(events[10].sampleCount(), 1);
+}
+
+class MockLogger : public SampleListener {
+ public:
+  MOCK_METHOD3(handleSample, void(int device, const Sample& sample, bool from_new_version));
+  MOCK_METHOD1(update, void(const Config& config));
+};
+
+class EventProfilerTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    auto cupti_events_ptr = std::make_unique<MockCuptiEvents>();
+    auto cupti_metrics_ptr = std::make_unique<MockCuptiMetrics>();
+    cuptiEvents_ = cupti_events_ptr.get();
+    cuptiMetrics_ = cupti_metrics_ptr.get();
+    loggers_.push_back(std::make_unique<MockLogger>());
+    onDemandLoggers_.push_back(std::make_unique<MockLogger>());
+    profiler_ = std::make_unique<EventProfiler>(
+        std::move(cupti_events_ptr),
+        std::move(cupti_metrics_ptr),
+        loggers_,
+        onDemandLoggers_);
+
+    for (int i = 0; i < kEventGroupCount; i++) {
+      // Use each slot's own address as a unique, non-null group handle
+      eventGroups_[i] = &eventGroups_[i];
+    }
+    for (int i = 0; i < kGroupSetCount; i++) {
+      // Default size to 1 but can be changed by test
+      groupSet_[i].numEventGroups = 1;
+      // Two groups per set
+      groupSet_[i].eventGroups = &eventGroups_[i * 2];
+    }
+    groupSets_.numSets = 1;
+    groupSets_.sets = groupSet_;
+  }
+
+  MockCuptiEvents* cuptiEvents_;
+  MockCuptiMetrics* cuptiMetrics_;
+  std::vector<std::unique_ptr<SampleListener>> loggers_;
+  std::vector<std::unique_ptr<SampleListener>> onDemandLoggers_;
+  constexpr static int kEventGroupCount = 4;
+  constexpr static int kGroupSetCount = 2;
+  CUpti_EventGroup eventGroups_[kEventGroupCount];
+  CUpti_EventGroupSet groupSet_[kGroupSetCount];
+  CUpti_EventGroupSets groupSets_;
+  std::unique_ptr<EventProfiler> profiler_;
+};
+
+TEST_F(EventProfilerTest, ConfigureFailure) {
+  using namespace testing;
+
+  // The default config has no counters enabled.
+  // Check that the profiler remains disabled.
+  Config cfg;
+  profiler_->configure(cfg, nullptr);
+
+  EXPECT_FALSE(profiler_->enabled());
+
+  // There is no event named "cycles".
+  // In this case the profiler should print a warning and remain disabled.
+  bool parsed = cfg.parse("EVENTS = cycles");
+  EXPECT_TRUE(parsed);
+
+  // EventProfiler should handle an exception thrown from createGroupSets.
+  // Configuration will be applied twice - once for combined base + on-demand
+  // and then again falling back to base.
+  EXPECT_CALL(*cuptiEvents_, eventId("cycles"))
+      .Times(2)
+      .WillRepeatedly(Return(0));
+  std::vector<CUpti_EventID> ids = {0};
+  EXPECT_CALL(*cuptiEvents_, createGroupSets(ids))
+      .Times(2)
+      .WillRepeatedly(Throw(
+          std::system_error(EINVAL, std::generic_category(), "Event ID")));
+  profiler_->configure(cfg, nullptr);
+
+  EXPECT_FALSE(profiler_->enabled());
+}
+
+TEST_F(EventProfilerTest, ConfigureBase) {
+  using namespace testing;
+
+  // Test the normal path with a simple base config
+  Config cfg;
+  bool parsed = cfg.parse("EVENTS = elapsed_cycles_sm");
+  EXPECT_TRUE(parsed);
+
+  // One valid event - expect one call to eventId and createGroupSets
+  EXPECT_CALL(*cuptiEvents_, eventId("elapsed_cycles_sm"))
+      .Times(1)
+      .WillOnce(Return(5));
+  std::vector<CUpti_EventID> ids = {5};
+  EXPECT_CALL(*cuptiEvents_, createGroupSets(ids))
+      .Times(1)
+      .WillOnce(Return(&groupSets_));
+  EXPECT_CALL(*cuptiEvents_, enablePerInstance(eventGroups_[0])).Times(1);
+  EXPECT_CALL(*cuptiEvents_, instanceCount(eventGroups_[0]))
+      .Times(1)
+      .WillOnce(Return(80));
+  EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[0]))
+      .Times(1)
+      .WillOnce(Return(ids));
+  EXPECT_CALL(*cuptiEvents_, enableGroupSet(_)).Times(1);
+
+  profiler_->configure(cfg, nullptr);
+
+  EXPECT_TRUE(profiler_->enabled());
+}
+
+TEST_F(EventProfilerTest, ConfigureOnDemand) {
+  using namespace testing;
+
+  // Test base + on-demand config, with one event and one metric
+  Config cfg, on_demand_cfg;
+  bool parsed = cfg.parse(R"(
+    EVENTS = active_cycles
+    SAMPLE_PERIOD_MSECS=500
+    REPORT_PERIOD_SECS=10
+    SAMPLES_PER_REPORT=5
+  )");
+  EXPECT_TRUE(parsed);
+
+  parsed = on_demand_cfg.parse(R"(
+    METRICS = ipc
+    EVENTS_DURATION_SECS=60
+    SAMPLE_PERIOD_MSECS=200
+    MULTIPLEX_PERIOD_MSECS=2000
+    REPORT_PERIOD_SECS=3
+    SAMPLES_PER_REPORT=10
+  )");
+  EXPECT_TRUE(parsed);
+
+  // One event
+  EXPECT_CALL(*cuptiEvents_, eventId("active_cycles"))
+      .Times(1)
+      .WillOnce(Return(3));
+  // One metric
+  EXPECT_CALL(*cuptiMetrics_, idFromName("ipc")).Times(1).WillOnce(Return(10));
+  std::map<CUpti_EventID, std::string> ipc_events;
+  ipc_events[4] = "instructions";
+  ipc_events[5] = "elapsed_cycles_sm";
+  EXPECT_CALL(*cuptiMetrics_, events(10)).Times(1).WillOnce(Return(ipc_events));
+  EXPECT_CALL(*cuptiMetrics_, evaluationMode(10))
+      .Times(1)
+      .WillOnce(Return(CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE));
+  EXPECT_CALL(*cuptiMetrics_, valueKind(10))
+      .Times(1)
+      .WillOnce(Return(CUPTI_METRIC_VALUE_KIND_DOUBLE));
+  std::vector<CUpti_EventID> ids = {3, 4, 5};
+  groupSet_[0].numEventGroups = 2;
+  groupSets_.numSets = 2;
+  EXPECT_CALL(*cuptiEvents_, createGroupSets(ids))
+      .Times(1)
+      .WillOnce(Return(&groupSets_));
+  // CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE was specified above,
+  // so check that per-instance collection is enabled
+  EXPECT_CALL(*cuptiEvents_, enablePerInstance(eventGroups_[0])).Times(1);
+  EXPECT_CALL(*cuptiEvents_, enablePerInstance(eventGroups_[1])).Times(1);
+  EXPECT_CALL(*cuptiEvents_, enablePerInstance(eventGroups_[2])).Times(1);
+  std::vector<CUpti_EventID> ids_g1{3}, ids_g2{4}, ids_g3{5};
+  EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[0]))
+      .Times(1)
+      .WillOnce(Return(ids_g1));
+  EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[1]))
+      .Times(1)
+      .WillOnce(Return(ids_g2));
+  EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[2]))
+      .Times(1)
+      .WillOnce(Return(ids_g3));
+  EXPECT_CALL(*cuptiEvents_, enableGroupSet(_)).Times(1);
+
+  profiler_->configure(cfg, &on_demand_cfg);
+
+  EXPECT_TRUE(profiler_->enabled());
+  EXPECT_EQ(profiler_->samplePeriod().count(), 250);
+  EXPECT_EQ(profiler_->multiplexPeriod().count(), 1000);
+  EXPECT_EQ(profiler_->reportPeriod().count(), 10000);
+  EXPECT_EQ(profiler_->onDemandReportPeriod().count(), 4000);
+}
+
+TEST_F(EventProfilerTest, ReportSample) {
+  using namespace testing;
+
+  // Test base + on-demand config, with one event and one metric
+  Config cfg, on_demand_cfg;
+  bool parsed = cfg.parse("EVENTS = active_cycles");
+  EXPECT_TRUE(parsed);
+
+  parsed = on_demand_cfg.parse(R"(
+    METRICS = ipc
+    EVENTS_DURATION_SECS=60
+  )");
+  EXPECT_TRUE(parsed);
+
+  // One event
+  EXPECT_CALL(*cuptiEvents_, eventId("active_cycles"))
+      .Times(1)
+      .WillOnce(Return(3));
+  // One metric
+  EXPECT_CALL(*cuptiMetrics_, idFromName("ipc")).Times(1).WillOnce(Return(10));
+  std::map<CUpti_EventID, std::string> ipc_events;
+  ipc_events[4] = "instructions";
+  ipc_events[5] = "elapsed_cycles_sm";
+  EXPECT_CALL(*cuptiMetrics_, events(10)).Times(1).WillOnce(Return(ipc_events));
+  EXPECT_CALL(*cuptiMetrics_, evaluationMode(10))
+      .Times(1)
+      .WillOnce(Return(CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE));
+  EXPECT_CALL(*cuptiMetrics_, valueKind(10))
+      .Times(1)
+      .WillOnce(Return(CUPTI_METRIC_VALUE_KIND_DOUBLE));
+  std::vector<CUpti_EventID> ids = {3, 4, 5};
+  groupSet_[0].numEventGroups = 2;
+  groupSets_.numSets = 2;
+  EXPECT_CALL(*cuptiEvents_, createGroupSets(ids))
+      .Times(1)
+      .WillOnce(Return(&groupSets_));
+  EXPECT_CALL(*cuptiEvents_, instanceCount(_))
+      .Times(3)
+      .WillRepeatedly(Return(4));
+  std::vector<CUpti_EventID> ids_g1{3}, ids_g2{4}, ids_g3{5};
+  // These will be called by collectSample() as well, which is called twice
+  // per group set
+  EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[0]))
+      .Times(3)
+      .WillRepeatedly(Return(ids_g1));
+  EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[1]))
+      .Times(3)
+      .WillRepeatedly(Return(ids_g2));
+  EXPECT_CALL(*cuptiEvents_, eventsInGroup(eventGroups_[2]))
+      .Times(3)
+      .WillRepeatedly(Return(ids_g3));
+  EXPECT_CALL(*cuptiEvents_, enableGroupSet(_)).Times(1);
+
+  profiler_->configure(cfg, &on_demand_cfg);
+
+  EXPECT_TRUE(profiler_->enabled());
+
+  EXPECT_CALL(*cuptiEvents_, readEvent(_, _, _))
+      .Times(6)
+      .WillRepeatedly(Invoke(
+          [](CUpti_EventGroup g, CUpti_EventID id, std::vector<int64_t>& vals) {
+            vals = {1, 2, 3, 4};
+          }));
+
+  // Need to collect four times - twice for each group set
+  profiler_->collectSample();
+  profiler_->collectSample();
+  EXPECT_CALL(*cuptiEvents_, disableGroupSet(_)).Times(1);
+  EXPECT_CALL(*cuptiEvents_, enableGroupSet(_)).Times(1);
+  profiler_->enableNextCounterSet();
+  profiler_->collectSample();
+  profiler_->collectSample();
+
+  std::vector<CUpti_EventID> ipc_ids = {4, 5};
+  // Called once for each instance (4) and once for the total.
+  // x2 since we recompute per logger.
+  EXPECT_CALL(
+      *cuptiMetrics_,
+      calculate(10, CUPTI_METRIC_VALUE_KIND_DOUBLE, ipc_ids, _, 2000000000))
+      .Times(10)
+      .WillRepeatedly(Return(SampleValue(0.3)));
+  auto& logger = dynamic_cast<MockLogger&>(*loggers_[0]);
+  EXPECT_CALL(logger, handleSample(0, _, _))
+      .Times(1)
+      .WillOnce(Invoke([](int device, const Sample& sample, bool from_new_version) {
+        // The sample will include all stats - the logger must pick the
+        // ones it wants.
+        EXPECT_EQ(sample.stats.size(), 4);
+        EXPECT_EQ(sample.stats[0].name, "active_cycles");
+        EXPECT_EQ(sample.stats[1].name, "instructions");
+        EXPECT_EQ(sample.stats[2].name, "elapsed_cycles_sm");
+        EXPECT_EQ(sample.stats[3].name, "ipc");
+        // 2 samples, each with values {1, 2, 3, 4}
+        // i.e. {2, 4, 6, 8} total
+        EXPECT_EQ(sample.stats[0].total.getInt(), 20);
+        EXPECT_EQ(sample.stats[0].percentileValues[0].second.getInt(), 2);
+        EXPECT_EQ(sample.stats[0].percentileValues.back().second.getInt(), 8);
+        // ipc is always 0.3 from the mocked calculate function above
+        EXPECT_EQ(sample.stats[3].total.getDouble(), 0.3);
+        EXPECT_EQ(sample.stats[3].percentileValues[0].second.getDouble(), 0.3);
+        EXPECT_EQ(
+            sample.stats[3].percentileValues.back().second.getDouble(), 0.3);
+      }));
+  profiler_->reportSamples();
+
+  auto& on_demand_logger = dynamic_cast<MockLogger&>(*onDemandLoggers_[0]);
+  EXPECT_CALL(on_demand_logger, handleSample(0, _, _)).Times(1);
+  profiler_->reportOnDemandSamples();
+
+  EXPECT_CALL(*cuptiEvents_, disableGroupSet(_)).Times(1);
+}
diff --git a/tb_plugins/profiling/libkineto/test/LoggerObserverTest.cpp b/tb_plugins/profiling/libkineto/test/LoggerObserverTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..30ba4a824af10401a45100b0b39cec54fcf98680
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/test/LoggerObserverTest.cpp
@@ -0,0 +1,96 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
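+//
+// Tests for the Logger observer mechanism: registered LoggerCollector
+// instances should capture INFO/WARNING/ERROR log messages, including when
+// many threads write messages concurrently.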
+
+#include <gtest/gtest.h>
+#include <pthread.h>
+
+// TODO(T90238193)
+// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude
+#include "include/libkineto.h"
+#include "src/Logger.h"
+#include "LoggerCollector.h"
+
+using namespace KINETO_NAMESPACE;
+
+#if !USE_GOOGLE_LOG
+
+constexpr char InfoTestStr[] = "Checking LOG(INFO)";
+constexpr char WarningTestStr[] = "Checking LOG(WARNING)";
+constexpr char ErrorTestStr[] = "Checking LOG(ERROR)";
+
+TEST(LoggerObserverTest, SingleCollectorObserver) {
+  // Add a LoggerObserverCollector to collect all logs during the trace.
+  std::unique_ptr<LoggerCollector> lCollector = std::make_unique<LoggerCollector>();
+  Logger::addLoggerObserver(lCollector.get());
+
+  LOG(INFO) << InfoTestStr;
+  LOG(WARNING) << WarningTestStr;
+  LOG(ERROR) << ErrorTestStr;
+
+  auto LoggerMD = lCollector->extractCollectorMetadata();
+  EXPECT_TRUE(LoggerMD[LoggerOutputType::INFO][0].find(InfoTestStr) != std::string::npos);
+  EXPECT_TRUE(LoggerMD[LoggerOutputType::WARNING][0].find(WarningTestStr) != std::string::npos);
+  EXPECT_TRUE(LoggerMD[LoggerOutputType::ERROR][0].find(ErrorTestStr) != std::string::npos);
+
+  Logger::removeLoggerObserver(lCollector.get());
+}
+
+#define NUM_OF_MESSAGES_FOR_EACH_TYPE 10
+#define NUM_OF_WRITE_THREADS 200
+
+// Writes NUM_OF_MESSAGES_FOR_EACH_TYPE messages for each INFO, WARNING, and ERROR.
+// NOLINTNEXTLINE(clang-diagnostic-unused-parameter)
+void* writeSeveralMessages(void* ptr) {
+  for (int i = 0; i < NUM_OF_MESSAGES_FOR_EACH_TYPE; i++) {
+    LOG(INFO) << InfoTestStr;
+    LOG(WARNING) << WarningTestStr;
+    LOG(ERROR) << ErrorTestStr;
+  }
+  return nullptr;
+}
+
+TEST(LoggerObserverTest, FourCollectorObserver) {
+  // Add four collectors and check that each of them receives all messages.
+  std::unique_ptr<LoggerCollector> lc1 = std::make_unique<LoggerCollector>();
+  std::unique_ptr<LoggerCollector> lc2 = std::make_unique<LoggerCollector>();
+  std::unique_ptr<LoggerCollector> lc3 = std::make_unique<LoggerCollector>();
+  std::unique_ptr<LoggerCollector> lc4 = std::make_unique<LoggerCollector>();
+  Logger::addLoggerObserver(lc1.get());
+  Logger::addLoggerObserver(lc2.get());
+  Logger::addLoggerObserver(lc3.get());
+  Logger::addLoggerObserver(lc4.get());
+
+  // Launch NUM_OF_WRITE_THREADS threads writing several messages.
+  pthread_t ListOfThreads[NUM_OF_WRITE_THREADS];
+  for (int i = 0; i < NUM_OF_WRITE_THREADS; i++) {
+    pthread_create(&ListOfThreads[i], nullptr, writeSeveralMessages, nullptr);
+  }
+
+  for (int i = 0; i < NUM_OF_WRITE_THREADS; i++) {
+    pthread_join(ListOfThreads[i], nullptr);
+  }
+
+  auto lc1MD = lc1->extractCollectorMetadata();
+  int InfoCount = 0, WarnCount = 0, ErrorCount = 0;
+  for (auto& md : lc1MD) {
+    InfoCount += md.first == LoggerOutputType::INFO ? md.second.size() : 0;
+    WarnCount += md.first == LoggerOutputType::WARNING ? md.second.size() : 0;
+    ErrorCount += md.first == LoggerOutputType::ERROR ? md.second.size() : 0;
+  }
+
+  EXPECT_EQ(InfoCount, NUM_OF_WRITE_THREADS * NUM_OF_MESSAGES_FOR_EACH_TYPE);
+  EXPECT_EQ(WarnCount, NUM_OF_WRITE_THREADS * NUM_OF_MESSAGES_FOR_EACH_TYPE);
+  EXPECT_EQ(ErrorCount, NUM_OF_WRITE_THREADS * NUM_OF_MESSAGES_FOR_EACH_TYPE);
+
+  Logger::removeLoggerObserver(lc1.get());
+  Logger::removeLoggerObserver(lc2.get());
+  Logger::removeLoggerObserver(lc3.get());
+  Logger::removeLoggerObserver(lc4.get());
+}
+
+#endif // !USE_GOOGLE_LOG
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tb_plugins/profiling/libkineto/test/MockActivitySubProfiler.cpp b/tb_plugins/profiling/libkineto/test/MockActivitySubProfiler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..89f1d536ca8d6d794b7ffc7402001d0e3d4d9c06
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/test/MockActivitySubProfiler.cpp
@@ -0,0 +1,49 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
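+//
+// Implementation of the mock child activity profiler declared in
+// MockActivitySubProfiler.h; it replays a fixed set of test activities
+// instead of collecting a real trace.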
+
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "test/MockActivitySubProfiler.h"
+
+namespace libkineto {
+
+const std::set<ActivityType> supported_activities{ActivityType::CPU_OP};
+const std::string profile_name{"MockProfiler"};
+
+void MockProfilerSession::processTrace(ActivityLogger& logger) {
+  for (const auto& activity : activities()) {
+    activity.log(logger);
+  }
+}
+
+const std::string& MockActivityProfiler::name() const {
+  return profile_name;
+}
+
+const std::set<ActivityType>& MockActivityProfiler::availableActivities() const {
+  return supported_activities;
+}
+
+MockActivityProfiler::MockActivityProfiler(
+    std::vector<GenericTraceActivity>& activities) :
+    test_activities_(activities) {}
+
+std::unique_ptr<IActivityProfilerSession> MockActivityProfiler::configure(
+    const std::set<ActivityType>& /*activity_types*/,
+    const Config& /*config*/) {
+  auto session = std::make_unique<MockProfilerSession>();
+  session->set_test_activities(std::move(test_activities_));
+  return session;
+}
+
+std::unique_ptr<IActivityProfilerSession> MockActivityProfiler::configure(
+    int64_t /*ts_ms*/,
+    int64_t /*duration_ms*/,
+    const std::set<ActivityType>& activity_types,
+    const Config& config) {
+  return configure(activity_types, config);
+}
+
+} // namespace libkineto
diff --git a/tb_plugins/profiling/libkineto/test/MockActivitySubProfiler.h b/tb_plugins/profiling/libkineto/test/MockActivitySubProfiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..36eaa13d1a544c624a2f4bb053891d055686ebf4
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/test/MockActivitySubProfiler.h
@@ -0,0 +1,72 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "include/IActivityProfiler.h"
+
+namespace libkineto {
+
+class MockProfilerSession : public IActivityProfilerSession {
+ public:
+  explicit MockProfilerSession() {}
+
+  void start() override {
+    start_count++;
+    status_ = TraceStatus::RECORDING;
+  }
+
+  void stop() override {
+    stop_count++;
+    status_ = TraceStatus::PROCESSING;
+  }
+
+  std::vector<GenericTraceActivity>& activities() override {
+    return test_activities_;
+  }
+
+  std::vector<std::string> errors() override {
+    return {};
+  }
+
+  void processTrace(ActivityLogger& logger) override;
+
+  void set_test_activities(std::vector<GenericTraceActivity>&& acs) {
+    test_activities_ = std::move(acs);
+  }
+
+  int start_count = 0;
+  int stop_count = 0;
+ private:
+  std::vector<GenericTraceActivity> test_activities_;
+};
+
+class MockActivityProfiler : public IActivityProfiler {
+ public:
+  explicit MockActivityProfiler(std::vector<GenericTraceActivity>& activities);
+
+  const std::string& name() const override;
+
+  const std::set<ActivityType>& availableActivities() const override;
+
+  std::unique_ptr<IActivityProfilerSession> configure(
+      const std::set<ActivityType>& activity_types,
+      const Config& config) override;
+
+  std::unique_ptr<IActivityProfilerSession> configure(
+      int64_t ts_ms,
+      int64_t duration_ms,
+      const std::set<ActivityType>& activity_types,
+      const Config& config) override;
+
+ private:
+  std::vector<GenericTraceActivity> test_activities_;
+};
+
+} // namespace libkineto
diff --git a/tb_plugins/profiling/libkineto/test/PidInfoTest.cpp b/tb_plugins/profiling/libkineto/test/PidInfoTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b86cfb36d0581ba9a8a03a09724b181c2fd2e88a
--- /dev/null
+++ b/tb_plugins/profiling/libkineto/test/PidInfoTest.cpp
@@ -0,0 +1,27 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
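+//
+// Tests for the ThreadUtil thread-name helpers
+// (setThreadName/getThreadName).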
+
+#include "include/ThreadUtil.h"
+
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+
+using namespace KINETO_NAMESPACE;
+
+TEST(ThreadNameTest, setAndGet) {
+  setThreadName("ThreadNameTest");
+  EXPECT_EQ(getThreadName(), "ThreadNameTest");
+
+  setThreadName("");
+  EXPECT_EQ(getThreadName(), "");
+
+  // Spaces etc are ok
+  setThreadName("Name w/ spaces");
+  EXPECT_EQ(getThreadName(), "Name w/ spaces");
+
+  // More than 16 chars is not OK
+  setThreadName("More than 16 characters");
+  EXPECT_EQ(getThreadName(), "Name w/ spaces");
+}
diff --git a/tb_plugins/profiling/tb_plugin/.flake8 b/tb_plugins/profiling/tb_plugin/.flake8
new file mode 100644
index 0000000000000000000000000000000000000000..1c5254b9f84568ba37d21c8e77e803558a6dae54
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 120
+per-file-ignores = __init__.py:F401 torch_tb_profiler/io/file.py:F401
diff --git a/tb_plugins/profiling/tb_plugin/.gitignore b/tb_plugins/profiling/tb_plugin/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dc7d4e6278beafdf41c8304cbc5915b3334095b3
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/.gitignore
@@ -0,0 +1,4 @@
+/build
+/dist
+/*.egg-info
+__pycache__
diff --git a/tb_plugins/profiling/tb_plugin/.pre-commit-config.yaml b/tb_plugins/profiling/tb_plugin/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a650ec83269e596e6a4634045da8a0eff17830b9
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/.pre-commit-config.yaml
@@ -0,0 +1,34 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# -------------------------------------------------------------------------
+default_language_version:
+  python: python3.8
+
+ci:
+  autofix_prs: true
+  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+  autoupdate_schedule: quarterly
+  # submodules: true
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.1.0
+    hooks:
+      - id: end-of-file-fixer
+        exclude: torch_tb_profiler/static/index.html
+      - id: trailing-whitespace
+      - id: double-quote-string-fixer
+
+  - repo: https://github.com/pre-commit/mirrors-autopep8
+    rev: v1.6.0
+    hooks:
+      - id: autopep8
+        name: Format code
+
+  - repo: https://github.com/PyCQA/flake8
+    rev: 4.0.1
+    hooks:
+      - id: flake8
+        args:
+          - "--max-line-length=120"
+          - "--per-file-ignores=__init__.py:F401 tb_plugin/torch_tb_profiler/io/file.py:F401"
+        name: Check PEP8
diff --git a/tb_plugins/profiling/tb_plugin/LICENSE b/tb_plugins/profiling/tb_plugin/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..edb179715b5213644cfe903d43294f54892e707e
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/LICENSE
@@ -0,0 +1,33 @@
+BSD License
+
+For Kineto software
+
+Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+All contributions by Microsoft:
+Copyright (c) Microsoft Corporation. (The Azure AI Platform team)
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Facebook nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/tb_plugins/profiling/tb_plugin/README.md b/tb_plugins/profiling/tb_plugin/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e3c00875d5df3d43b541ea0ce35faae6d7da58c9
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/README.md
@@ -0,0 +1,478 @@
+# PyTorch Profiler TensorBoard Plugin
+
+This is a TensorBoard plugin that provides visualization of PyTorch profiling.
+It can parse, process and visualize the PyTorch Profiler's dumped profiling results,
+and give optimization recommendations.
+
+### Quick Installation Instructions
+
+* Install from pypi
+
+  `pip install torch-tb-profiler`
+
+* Or you can install from source
+
+  Clone the git repository:
+
+  `git clone https://github.com/pytorch/kineto.git`
+
+  Navigate to the `kineto/tb_plugin` directory.
+
+  Install with command:
+
+  `pip install .`
+
+* Build the wheel
+  - `python setup.py build_fe sdist bdist_wheel` \
+    **_Note_**: the build_fe step requires setting up yarn and Node.js
+  - `python setup.py sdist bdist_wheel`
+
+### Quick Start Instructions
+
+* Prepare profiling data
+
+  We have prepared some sample profiling data at [kineto/tb_plugin/samples](./samples),
+  which you can download directly.
+  You can also generate these profiling samples yourself by running
+  [kineto/tb_plugin/examples/resnet50_profiler_api.py](./examples/resnet50_profiler_api.py),
+  or learn how to profile your model and generate profiling data from the
+  [PyTorch Profiler tutorial](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html?highlight=tensorboard).
+
+  Note: The recommended way to produce profiling data is assigning `torch.profiler.tensorboard_trace_handler`
+  to `on_trace_ready` on creation of `torch.profiler.profile`.
+
+* Start TensorBoard
+
+  Specify the profiling data folder as `logdir` in TensorBoard. If you use the above sample data, start TensorBoard with:
+
+  `tensorboard --logdir=./samples`
+
+  If your web browser is not on the same machine on which you started TensorBoard,
+  you can add the `--bind_all` option, such as:
+
+  `tensorboard --logdir=./samples --bind_all`
+
+  Note: Make sure the default port 6006 is open to the browser's host.
+
+* Open TensorBoard in Chrome browser
+
+  Open the URL `http://localhost:6006` in the browser.
+  If you used `--bind_all` in the tensorboard start command, the hostname may not be 'localhost'. You can find it in the log printed after the command.
+
+* Navigate to the PYTORCH_PROFILER tab
+
+  If the files under `--logdir` are too big or too many,
+  please wait a while and refresh the browser to check the latest loaded results.
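+
+  As a reference, a minimal profiling script in the spirit of the tutorial above looks like the
+  following sketch (the model, input shape, output folder and step count here are placeholder
+  choices, not requirements of the plugin):
+
+  ```python
+  import torch
+  import torchvision.models as models
+
+  model = models.resnet50()             # any model works; resnet50 is just an example
+  inputs = torch.randn(4, 3, 224, 224)
+
+  # tensorboard_trace_handler writes one trace file per worker under ./samples/resnet50,
+  # the folder you then pass to `tensorboard --logdir`.
+  with torch.profiler.profile(
+      schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
+      on_trace_ready=torch.profiler.tensorboard_trace_handler('./samples/resnet50'),
+      record_shapes=True,  # enables "Operator + Input Shape" grouping in Operator View
+      with_stack=True,     # records the call stacks shown in the CallStack column
+  ) as prof:
+      for step in range(5):
+          model(inputs)
+          prof.step()      # informs the profiler that one step (iteration) has finished
+  ```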
+
+* Loading profiling data from the cloud
+  * AWS S3 (S3://)
+
+    Install `boto3`. Set the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. Optionally, `S3_ENDPOINT` can be set as well.\
+    For minio, the S3 url should start with the bucket name `s3://<bucket>/<folder>/` rather than the minio prefix `s3://minio/<bucket>/<folder>`. In that case, `S3_ENDPOINT` is needed as well. \
+    Follow these guides to get set up with minio:
+    * Server: https://docs.min.io/docs/minio-quickstart-guide.html
+    * MC Client: https://docs.min.io/docs/minio-client-quickstart-guide.html
+
+    For example, the following commands can be used to create minio storage:
+    ```bash
+    ./mc alias set s3 http://10.150.148.189:9000 minioadmin minioadmin
+    ./mc mb s3/profiler --region=us-east-1
+    ./mc cp ~/notebook/version_2 s3/profiler/ --recursive
+    export AWS_ACCESS_KEY_ID=minioadmin
+    export AWS_SECRET_ACCESS_KEY=minioadmin
+    export AWS_REGION=us-east-1
+    export S3_USE_HTTPS=0
+    export S3_VERIFY_SSL=0
+    export S3_ENDPOINT=http://localhost:9000
+    tensorboard --logdir=s3://profiler/version_2/ --bind_all
+    ```
+
+  * Azure Blob (https://\<account\>.blob.core.windows.net)
+
+    Install `azure-storage-blob`. Optionally, set the environment variable `AZURE_STORAGE_CONNECTION_STRING`.
+
+  * Google Cloud (GS://)
+
+    Install `google-cloud-storage`.
+
+  ---
+  > **_NOTES:_** For AWS S3, Google Cloud and Azure Blob, the trace files need to be put in a top-level folder under the bucket/container.
+  ---
+
+  We have prepared some sample data in Azure Blob; you can access it using the command
+
+      tensorboard --logdir=https://torchtbprofiler.blob.core.windows.net/torchtbprofiler/demo/ --bind_all
+
+  and open TensorBoard in your browser to see all the views described below.
+
+  Note: for accessing data in Azure Blob, you need to install torch-tb-profiler with `pip install torch-tb-profiler[blob]`
+
+### Quick Usage Instructions
+
+We regard each execution with the profiler enabled as a "run".
+In most cases a run is a single process. If DDP is enabled, a run includes multiple processes.
+We call each process a "worker".
+
+Each run corresponds to a sub-folder under the folder specified by `--logdir`.
+Each sub-folder contains one or more chrome trace files, one for each process.
+kineto/tb_plugin/samples is an example of how the files are organized.
+
+You can select the run and worker on the left control panel.
+
+![Alt text](./docs/images/control_panel.PNG)
+
+Runs: Select a run. Each run is one execution of a PyTorch application with profiling enabled.
+
+Views: We organize the profiling result into multiple views,
+from coarse-grained (overview-level) to fine-grained (kernel-level).
+
+Workers: Select a worker. Each worker is a process. There can be multiple workers when DDP is used.
+
+Span: Multiple profiling trace files with different spans may be generated when
+[torch.profiler.schedule](https://github.com/pytorch/pytorch/blob/master/torch/profiler/profiler.py#L24)
+is used as the schedule of torch.profiler.
+You can select among them with this selection box.
+
+Currently we have the following performance diagnosis views:
+- Overall View
+- Operator View
+- Kernel View
+- Trace View
+- Memory View
+- Distributed View
+
+We describe each of these views below.
+
+* Overall View
+
+  The overall view is a top-level view of the process in your profiling run.
+  It shows an overview of time cost, including both host and GPU devices.
+  You can select the current worker in the left panel's "Workers" dropdown menu.
+
+  An example of overall view:
+  ![Alt text](./docs/images/overall_view.PNG)
+
+  The 'GPU Summary' panel shows GPU information and usage metrics for this run, including the name,
+  global memory, and compute capability of the GPU.
+  'GPU Utilization', 'Est. SM Efficiency' and 'Est. Achieved Occupancy' show the GPU usage efficiency of this run at different levels.
+  'Kernel Time using Tensor Cores' shows the percentage of time during which Tensor Core kernels are active.
+  Detailed information about these four metrics can be found at [gpu_utilization](./docs/gpu_utilization.md).
+
+  The 'Step Time Breakdown' panel shows the performance summary. We regard each iteration (usually a mini-batch) as a step.
+  The time spent on each step is broken down into the following categories:
+
+  1. Kernel: kernel execution time on the GPU device;
+
+  2. Memcpy: GPU-involved memory copy time (D2D, D2H or H2D);
+
+  3. Memset: GPU-involved memory set time;
+
+  4. Communication: communication time; it only appears in the DDP case;
+
+  5. Runtime: CUDA runtime execution time on the host side,
+     such as cudaLaunchKernel, cudaMemcpyAsync, cudaStreamSynchronize, ...;
+
+  6. DataLoader: the data loading time spent in the PyTorch DataLoader object;
+
+  7. CPU Exec: host compute time, including every PyTorch operator's running time;
+
+  8. Other: the time not included in any of the above.
+
+  Note: the sum of all the above categories is the end-to-end wall-clock time of the step.
+
+  The above list is ranked by priority from high to low, and we count time in priority order.
+  Time in the highest-priority category (Kernel) is counted first,
+  then Memcpy, then Memset, ..., and Other is counted last.
+  In the following example, "Kernel" is counted first as 7-2=5 seconds;
+  then "Memcpy" is counted as 0 seconds, because it is fully hidden by "Kernel";
+  then "CPU Exec" is counted as 2-1=1 second, because the [2,3] interval is hidden by "Kernel" and only the [1,2] interval is counted.
+
+  In this way, the counted times of all categories in a step
+  sum to the step's total wall-clock time.
+
+  ![Alt text](./docs/images/time_breakdown_priority.PNG)
+
+  Performance Recommendation: Leverage the profiling result to automatically highlight likely bottlenecks,
+  and give users actionable optimization suggestions.
+
+* Operator View
+
+  This view displays the performance of every PyTorch operator that is executed either on the host or device.
+
+  ![Alt text](./docs/images/operator_view.PNG)
+  Each table row is a PyTorch operator, which is a computation operator implemented in C++,
+  such as "aten::relu_" or "aten::convolution".
+
+  Calls: How many times the operator is called in this run.
+
+  Device Self Duration: The accumulated time spent on GPU, not including this operator's child operators.
+
+  Device Total Duration: The accumulated time spent on GPU, including this operator's child operators.
+
+  Host Self Duration: The accumulated time spent on Host, not including this operator's child operators.
+
+  Host Total Duration: The accumulated time spent on Host, including this operator's child operators.
+
+  Tensor Cores Eligible: Whether this operator is eligible to use Tensor Cores.
+
+  Tensor Cores Self (%): Time of self-kernels with Tensor Cores / Time of self-kernels.
+  Self-kernels don't include kernels launched by this operator's child operators.
+
+  Tensor Cores Total (%): Time of kernels with Tensor Cores / Time of kernels.
+
+  CallStack: All call stacks of this operator, if recorded in the profiling trace file.
+  To dump this call stack information, set the 'with_stack' parameter in the torch.profiler API.
+  TensorBoard is integrated with VS Code; if you launch TensorBoard inside VS Code, clicking a CallStack entry
+  will jump to the corresponding line of source code, as shown below:
+
+  ![Alt text](./docs/images/vscode_stack.PNG)
+
+  Note: Each duration above is wall-clock time. It doesn't mean the GPU or CPU is fully utilized during that period.
+
+  The top 4 pie charts are visualizations of the above 4 duration columns.
+  They make the breakdowns visible at a glance.
+  Only the top N operators sorted by duration (configurable in the text box) are shown in the pie charts.
+
+  The search box enables searching operators by name.
+
+  "Group By" toggles between "Operator" and "Operator + Input Shape".
+  The "Input Shape" is the list of shapes of the tensors in this operator's input argument list.
+  An empty "[]" means an argument of scalar type.
+  For example, "[[32, 256, 14, 14], [1024, 256, 1, 1], [], [], [], [], [], [], []]"
+  means this operator has 9 input arguments:
+  the 1st is a tensor of size 32\*256\*14\*14,
+  the 2nd is a tensor of size 1024\*256\*1\*1,
+  and the following 7 are scalar variables.
+
+  ![Alt text](./docs/images/operator_view_group_by_inputshape.PNG)
+
+* Kernel View
+
+  This view shows each kernel's time spent on GPU.
+  The time is calculated by subtracting the kernel's start time from its end time.
+
+  Note: This view does not include cudaMemcpy or cudaMemset, because they are not kernels.
+
+  ![Alt text](./docs/images/kernel_view.PNG)
+
+  * Tensor Cores Used: Whether this kernel uses Tensor Cores.
+
+  * Total Duration: The accumulated time of all calls of this kernel.
+
+  * Mean Duration: The average duration of all calls, that is, "Total Duration" divided by "Calls".
+
+  * Max Duration: The maximum duration among all calls.
+
+  * Min Duration: The minimum duration among all calls.
+
+  Note: These durations only include a kernel's elapsed time on the GPU device.
+  It does not mean the GPU is fully busy executing instructions during this time interval.
+  Some of the GPU cores may be idle due to reasons such as memory access latency or insufficient parallelism.
+  For example, there may be an insufficient number of available warps per SM for the GPU to effectively
+  hide memory access latencies, or some SMs may be entirely idle due to an insufficient number of blocks.
+  Please refer to [Nvidia's best-practices guide](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html).
+  To investigate the efficiency of each kernel, we calculate and show 'Mean Blocks Per SM' and 'Mean Est. Achieved Occupancy' in the last two columns.
+
+  * Mean Blocks Per SM: Blocks per SM = Blocks of this kernel / SM number of this GPU. If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized. "Mean Blocks per SM" is the weighted average over all runs of this kernel name, using each run's duration as the weight.
+
+  * Mean Est. Achieved Occupancy: For the definition of Est. Achieved Occupancy, refer to [gpu_utilization](./docs/gpu_utilization.md). It is the weighted average over all runs of this kernel name, using each run's duration as the weight.
+
+  The top left pie chart is a visualization of the "Total Duration" column.
+  It makes the breakdowns visible at a glance.
+  Only the top N kernels sorted by accumulated time (configurable in the text box) are shown in the pie chart.
+
+  The top right pie chart shows the percentage of kernel time with and without Tensor Cores.
+
+  The search box enables searching kernels by name.
+
+  "Group By" toggles between "Kernel Name" and "Kernel Properties + Op Name".
+
+  "Kernel Name" groups kernels by kernel name.
+
+  "Kernel Properties + Op Name" groups kernels by the combination of kernel name, launching operator name,
+  grid, block, registers per thread, and shared memory.
+
+  ![Alt text](./docs/images/kernel_view_group_by_properties_and_op.PNG)
+
+  * Operator: The name of the PyTorch operator which launches this kernel.
+
+  * Grid: Grid size of this kernel.
+
+  * Block: Block size of this kernel.
+
+  * Register Per Thread: Number of registers required for each thread executing the kernel.
+
+  * Shared Memory: Sum of dynamic shared memory reserved and static shared memory allocated for this kernel.
+
+* Trace View
+
+  This view shows the timeline using the chrome tracing plugin. Each horizontal area represents a thread or a CUDA stream.
+  Each colored rectangle represents an operator, a CUDA runtime call, or a GPU op which executes on GPU
+  (such as a kernel, a CUDA memory copy, a CUDA memory set, ...).
+
+  ![Alt text](./docs/images/trace_view.PNG)
+
+  In the above example:
+
+  "thread 25772" is the CPU thread that runs the backward pass of the neural network.
+
+  "thread 25738" is the main CPU thread, which mainly performs data loading, the forward pass of the neural network, and the model update.
+
+  "stream 7" is a CUDA stream, which shows all kernels of this stream.
+
+  You can see there are 6 "ProfilerStep" ranges at the top of "thread 1". Each "ProfilerStep" represents a mini-batch step.
+
+  The floating toolbar provides functions to help inspect the timeline.
+  For example, when the up-down arrow is enabled,
+  you can zoom in by dragging the mouse up while holding the left mouse button down.
+
+  ![Alt text](./docs/images/trace_view_one_step.PNG)
+
+  The "Optimizer.step#SGD.step" and "enumerate(DataLoader)#_SingleProcessDataLoaderIter.\__next\__"
+  are high-level python side functions.
+
+  When you set the top-right corner's "Flow events" to "async",
+  you can see the relationship between an operator and its launched kernels.
+  ![Alt text](./docs/images/trace_view_launch.PNG)
+
+  You can also view the GPU utilization and Est. SM Efficiency in the trace view. They are drawn alongside the timeline:
+
+  ![Alt text](./docs/images/trace_view_gpu_utilization.PNG)
+
+  When you set the top-right corner's "Flow events" to "fwd_bwd_correlation",
+  you can see the relationship between a forward operator and its launched backward operator.
+  Note: Only the backward operator's directly launching forward operator will be connected by a line;
+  its ancestor operators which call this operator as a child will not be connected.
+  ![Alt text](./docs/images/trace_view_fwd_bwd_correlation.PNG)
+
+* Memory View
+
+  The PyTorch profiler records all memory allocation/release events and the allocator's internal state during profiling. For
+  each operator, the plugin aggregates all the events inside its lifespan.
+
+  ![Alt text](./docs/images/memory_view.PNG)
+
+  The memory kind can be selected in the 'Device' selection box. For example, 'GPU0' means the following plot and tables only show each
+  operator's memory usage on GPU 0, not including CPU or other GPUs.
+
+  * Memory Curve
+
+    The memory curve shows memory usage trends. It helps the user get an overview of memory consumption. The 'Allocated' plot is the
+    total memory requested from the allocator, for example, used by tensors. The 'Reserved' plot only makes sense if the underlying
+    allocator makes use of a caching mechanism.
+    It represents the total memory that is allocated from the operating system by the allocator.
+
+    You can select a range on the memory curve plot and zoom into it by pressing the left mouse button and dragging on the curve.
+    Right-clicking resets the plot to its initial state. The selection affects the 'Memory Events' table and the 'Memory Statistics' table,
+    as mentioned in the following sections.
+
+  * Memory Events
+
+    The memory events table shows memory allocation and release event pairs. Definition of each field in the table:
+
+    * Operator: The immediate operator causing the allocation. In PyTorch, some operators such as
+      `aten::empty` are widely used as an API for tensor creation; in this case, we show it as `<parent operator> (aten::empty)`.
+
+    * Size: The allocated memory size.
+
+    * Allocation Time: The memory allocation time point, relative to profiler start. It may be missing from the table if the allocation event
+      is not included in the selected range.
+
+    * Release Time: The memory deallocation time point, relative to profiler start. It may be missing from the table if the release event is
+      not included in the selected range. Note that a released memory block might still be cached by the underlying allocator.
+
+    * Duration: The lifetime of the allocated memory. It may be missing from the table if the Allocation Time or Release Time is absent.
+
+  * Memory Statistics
+
+    Definition of each field in the table:
+
+    * Calls: How many times this operator is called.
+
+    * Size Increase: The memory increase, including all child operators. It sums all allocated bytes and subtracts all freed bytes.
+
+    * Self Size Increase: The memory increase associated with the operator itself, excluding that of its children. It sums all allocated bytes and subtracts all freed bytes.
+
+    * Allocation Count: The allocation count, including all child operators.
+
+    * Self Allocation Count: The allocation count belonging to the operator itself, excluding its children.
+
+    * Allocation Size: The allocation size, including all child operators. It sums all allocated bytes without considering memory frees.
+
+    * Self Allocation Size: The allocation size belonging to the operator itself. It sums all allocated bytes without considering memory frees.
+
+* Distributed View
+
+  This view appears automatically only for DDP jobs that use nccl for communication.
+  There are four panels in this view:
+
+  ![Alt text](./docs/images/distributed_view.PNG)
+
+  * The top panel shows information about the node/process/GPU hierarchy of this job.
+
+  * The left panel in the middle is the 'Computation/Communication Overview'. Definition of each legend:
+    * Computation: the sum of kernel time on GPU minus the overlapping time.
+    * Overlapping: the overlapping time of computation and communication. More overlapping represents better parallelism between computation and communication. Ideally the communication would be totally overlapped with computation.
+    * Communication: the total communication time minus the overlapping time.
+    * Other: step time minus computation and communication time. It may include initialization, data loading, CPU computation, and so on.
+
+  From this view, you can see the computation-to-communication ratio of each worker and the load balance between workers. For example, if the computation
+  plus overlapping time of one worker is much larger than that of the others, there may be a load-balancing problem, or this worker may be a straggler.
+
+  * The right panel in the middle is the 'Synchronizing/Communication Overview'. Definition of each legend:
+    * Data Transfer Time: the part of the total communication time actually spent exchanging data.
+    * Synchronizing Time: the part of the total communication time spent waiting for and synchronizing with other workers.
+
+  From this view, you can see the efficiency of communication (how much of the total communication time is really used for exchanging data, and how much is just waiting for data from other workers).
+
+  * The 'Communication Operations Stats' panel summarizes the detailed statistics of all communication ops in each worker. Definition of each field:
+    * Calls: How many times this operator is called in this run.
+    * Total Size (bytes): Total data size transferred in operators of this type.
+    * Avg Size (bytes): Average data size transferred in each operator of this type.
+    * Total Latency (us): Total latency of all operators of this type.
+    * Avg Latency (us): Average latency of each operator of this type.
+    * Data Transfer Time (us): Total time actually used for data transfer in operators of this type.
+    * Avg Data Transfer Time (us): Average time actually used for data transfer in each operator of this type.
+
+* Module View
+
+  If torch.nn.Module information is dumped into the resulting Chrome tracing file by the PyTorch profiler, the plugin can display the nn.Module hierarchy and summary.
+
+  ![Alt text](./docs/images/module_view.png)
+
+  * The top table shows statistics for each torch.nn.Module, including:
+    * Occurrences: how many times the module is called in the training process.
+    * Operators: how many operators the module invokes.
+    * Host Total Time: the accumulated time spent on Host, including child submodules.
+    * Host Self Time: the accumulated time spent on Host, not including child submodules.
+    * Device Total Time: the accumulated time spent on GPU by the operators contained in the module, including child submodules.
+    * Device Self Time: the accumulated time spent on GPU by the operators contained in the module, not including child submodules.
+
+  * The middle flamegraph shows the torch.nn.Module hierarchy information.
+  * The bottom graph shows the main thread's operator tree.
+
+* Lightning View
+
+  If the Chrome tracing file is from a PyTorch Lightning job, the plugin shows a Lightning View which is customized for PyTorch Lightning.
+  All the data in this view comes from the PyTorch Lightning framework.
+
+  ![Alt text](./docs/images/lightning_view.png)
+
+  * The top table shows the model structure. The meaning of the metrics in the table is the same as in the Module View.
+  * The middle flamegraph shows the model hierarchy information.
+  * The bottom graph shows the call tree of all hooks in PyTorch Lightning.
+
+* Diff Run View
+
+  The diff run feature helps compare two runs along a logical timeline. The key comparison operators include backward, dataloader, torch.nn.Module, and optimizer. If an operator contains these sub-operators internally, you can zoom into it by clicking its bar.
+
+  ![Alt text](./docs/images/diff_view.png)
+
+  * The top bar chart shows each operator type and the trend comparison result.
+  * The middle line chart shows the delta and accumulated execution time difference for each operator type.
+  * The bottom table shows the operator differences for the following categories:
+    * Host Total Duration: The accumulated time spent on Host, including this operator's child operators.
+    * Host Self Duration: The accumulated time spent on Host, not including this operator's child operators.
+    * Device Total Duration: The accumulated time spent on GPU, including this operator's child operators.
+    * Device Self Duration: The accumulated time spent on GPU, not including this operator's child operators.
+
+### PyTorch Profiler TensorBoard Plugin 0.2 Release Notes
+
+Known Issues: This software does not support Python 3.9.0, 3.9.1, 3.9.2.
+If launching TensorBoard reports an "ImportError" mentioning a "circular import",
+please update your Python to a newer version.
diff --git a/tb_plugins/profiling/tb_plugin/ci_scripts/install_env.sh b/tb_plugins/profiling/tb_plugin/ci_scripts/install_env.sh
new file mode 100644
index 0000000000000000000000000000000000000000..11f588a031b7cff1d032a59d4e26b1a9e027015b
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/ci_scripts/install_env.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+set -ex
+
+# install cuda
+#if [ "$CUDA_VERSION" = "cu101" ]; then
+#  wget https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run
+#  sudo sh cuda_10.1.243_418.87.00_linux.run
+#elif [ "$CUDA_VERSION" = "cu102" ]; then
+#  wget https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
+#  sudo sh cuda_10.2.89_440.33.01_linux.run
+#elif [ "$CUDA_VERSION" = "cu111" ]; then
+#  wget https://developer.download.nvidia.com/compute/cuda/11.1.0/local_installers/cuda_11.1.0_455.23.05_linux.run
+#  sudo sh cuda_11.1.0_455.23.05_linux.run
+#elif [ "$CUDA_VERSION" = "cu112" ]; then
+#  wget https://developer.download.nvidia.com/compute/cuda/11.2.0/local_installers/cuda_11.2.0_460.27.04_linux.run
+#  sudo sh cuda_11.2.0_460.27.04_linux.run
+#fi
+
+# install pytorch
+pip install numpy tensorboard typing-extensions pillow pytest
+if [ "$PYTORCH_VERSION" = "nightly" ]; then
+  pip install --pre torch -f "https://download.pytorch.org/whl/nightly/$CUDA_VERSION/torch_nightly.html"
+  pip install --pre torchvision --no-deps -f "https://download.pytorch.org/whl/nightly/$CUDA_VERSION/torch_nightly.html"
+elif [ "$PYTORCH_VERSION" = "1.11rc" ]; then
+  pip install --pre torch -f "https://download.pytorch.org/whl/test/$CUDA_VERSION/torch_test.html"
+  pip install --pre torchvision --no-deps -f "https://download.pytorch.org/whl/test/$CUDA_VERSION/torch_test.html"
+elif [ "$PYTORCH_VERSION" = "stable" ]; then
+  pip install torch torchvision
+fi
+
+python -c "import torch; print(torch.__version__, torch.version.git_version); from torch.autograd import kineto_available; print(kineto_available())"
diff --git a/tb_plugins/profiling/tb_plugin/docs/gpu_utilization.md b/tb_plugins/profiling/tb_plugin/docs/gpu_utilization.md
new file mode 100644
index 0000000000000000000000000000000000000000..c4f45b880c71fd25f3d2d727408390e4751b72da
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/docs/gpu_utilization.md
@@ -0,0 +1,22 @@
+* GPU Utilization: GPU busy time / all steps time. The higher, the better. "All steps time" is the total time of all profiler steps (also called iterations).
+  GPU busy time is the portion of "all steps time" during which at least one GPU kernel is running on this GPU.
+  However, this high-level utilization metric is coarse. It can't tell how many SMs (Stream Multiprocessors) are in use.
+  For example, a kernel with a single thread running continuously will get 100% GPU utilization.
+
+* Est. SM Efficiency: Estimated Stream Multiprocessor Efficiency. The higher, the better.
+  The per-kernel metric is SM_Eff_K = min(blocks of this kernel / number of SMs on this GPU, 100%).
+  The overall number is the sum of all kernels' SM_Eff_K, weighted by each kernel's execution duration and divided by "all steps time".
+  It shows the utilization of the GPU's Streaming Multiprocessors.
+  Although it is finer-grained than the "GPU Utilization" above, it still can't tell the whole story.
+  For example, a kernel with only one thread per block can't fully utilize the SMs it runs on.
+
+* Est. Achieved Occupancy: For most cases, such as memory-bandwidth-bound kernels, a higher value often translates to better performance, especially when the initial value is very low. [Reference](http://developer.download.nvidia.com/GTC/PDF/GTC2012/PresentationPDF/S0514-GTC2012-GPU-Performance-Analysis.pdf). The definition of occupancy is [here](https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/achievedoccupancy.htm).
+  Occupancy is the ratio of active warps on an SM to the maximum number of
+  active warps supported by the SM. The theoretical occupancy of a kernel is the upper bound on its occupancy, limited by multiple
+  factors such as kernel shape, the resources the kernel uses, and the GPU's compute capability.
+  The per-kernel metric is OCC_K = min(threads of the kernel / number of SMs / max threads per SM, theoretical occupancy of the kernel).
+  The overall number is the average of all kernels' OCC_K, weighted by each kernel's execution duration. It shows fine-grained, low-level GPU utilization.
+
+* Kernel Time using Tensor Cores: Total GPU Time for Tensor Core kernels / Total GPU Time for all kernels. The higher, the better.
+  Tensor Cores are mixed-precision floating-point units available on Volta GPUs (e.g., Titan V) and beyond.
+  The cuDNN and cuBLAS libraries provide Tensor Core-enabled GPU kernels for most Convolution and GEMM operations.
+  This number shows the ratio of time spent in Tensor Core kernels among all kernels on a GPU.
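+
+To make the weighting concrete, here is a small illustrative sketch of how the duration-weighted metrics above can be computed. It is not the plugin's actual implementation; the device constants and kernel records are hypothetical:
+
+```python
+# Hypothetical device constants and kernel records, for illustration only.
+NUM_SMS = 80                # number of SMs on the GPU (e.g., V100)
+MAX_THREADS_PER_SM = 2048   # max resident threads per SM
+ALL_STEPS_TIME_US = 2000.0  # total time of all profiler steps
+
+# (blocks, threads_per_block, theoretical_occupancy, duration_us)
+kernels = [
+    (160, 256, 0.5, 1200.0),
+    (40, 128, 1.0, 300.0),
+]
+
+# Est. SM Efficiency: duration-weighted sum of per-kernel SM_Eff_K,
+# divided by "all steps time" (so GPU idle time lowers the number).
+sm_eff = sum(min(b / NUM_SMS, 1.0) * dur
+             for b, _, _, dur in kernels) / ALL_STEPS_TIME_US
+
+# Est. Achieved Occupancy: duration-weighted average of per-kernel OCC_K.
+total_kernel_time = sum(dur for *_, dur in kernels)
+occ = sum(min(b * t / (NUM_SMS * MAX_THREADS_PER_SM), theo) * dur
+          for b, t, theo, dur in kernels) / total_kernel_time
+
+print(f"Est. SM Efficiency: {sm_eff:.1%}, Est. Achieved Occupancy: {occ:.1%}")
+```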
diff --git a/tb_plugins/profiling/tb_plugin/docs/images/control_panel.PNG b/tb_plugins/profiling/tb_plugin/docs/images/control_panel.PNG new file mode 100644 index 0000000000000000000000000000000000000000..31bd12d9ce7c0d5efa17056ea870de5e835a5031 Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/control_panel.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/diff_view.png b/tb_plugins/profiling/tb_plugin/docs/images/diff_view.png new file mode 100644 index 0000000000000000000000000000000000000000..7747c29c076926289efe1c38025d98c225e6351a Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/diff_view.png differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/distributed_view.PNG b/tb_plugins/profiling/tb_plugin/docs/images/distributed_view.PNG new file mode 100644 index 0000000000000000000000000000000000000000..95bf38565b57be649b2174eefaa37586cf29fd4a Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/distributed_view.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/kernel_view.PNG b/tb_plugins/profiling/tb_plugin/docs/images/kernel_view.PNG new file mode 100644 index 0000000000000000000000000000000000000000..53d0c57ae5ed36130db58d4e93fd2392a4ed8760 Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/kernel_view.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/kernel_view_group_by_properties_and_op.PNG b/tb_plugins/profiling/tb_plugin/docs/images/kernel_view_group_by_properties_and_op.PNG new file mode 100644 index 0000000000000000000000000000000000000000..5001f28c75a1257689758a15553c9594cd86edd7 Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/kernel_view_group_by_properties_and_op.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/lightning_view.png b/tb_plugins/profiling/tb_plugin/docs/images/lightning_view.png new file mode 100644 index 0000000000000000000000000000000000000000..03a5004f16bf188755640b9de2fce5545af7940f Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/lightning_view.png differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/memory_view.PNG b/tb_plugins/profiling/tb_plugin/docs/images/memory_view.PNG new file mode 100644 index 0000000000000000000000000000000000000000..1b1446dfac2b0b3f9ac9ba0d2033659d6a248ca6 Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/memory_view.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/module_view.png b/tb_plugins/profiling/tb_plugin/docs/images/module_view.png new file mode 100644 index 0000000000000000000000000000000000000000..610cf202a3bc6154c79085fe9f6ab12c79c2151a Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/module_view.png differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/operator_view.PNG b/tb_plugins/profiling/tb_plugin/docs/images/operator_view.PNG new file mode 100644 index 0000000000000000000000000000000000000000..351c69883aed74573e53869b66fa21d2ed285c62 Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/operator_view.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/operator_view_group_by_inputshape.PNG b/tb_plugins/profiling/tb_plugin/docs/images/operator_view_group_by_inputshape.PNG new file mode 100644 index 0000000000000000000000000000000000000000..cccdd1cefb37ab7e113cb5be2ed3ec7de3ffedaf Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/operator_view_group_by_inputshape.PNG 
differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/overall_view.PNG b/tb_plugins/profiling/tb_plugin/docs/images/overall_view.PNG new file mode 100644 index 0000000000000000000000000000000000000000..916be90c0674f8fba5ac36d0c6e07cfd258f49e5 Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/overall_view.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/time_breakdown_priority.PNG b/tb_plugins/profiling/tb_plugin/docs/images/time_breakdown_priority.PNG new file mode 100644 index 0000000000000000000000000000000000000000..c8574772786b2e38f52793c51e96fee8e9299aa7 Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/time_breakdown_priority.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/trace_view.PNG b/tb_plugins/profiling/tb_plugin/docs/images/trace_view.PNG new file mode 100644 index 0000000000000000000000000000000000000000..aa1ced94750c0c449e9136c513b39638d4d520aa Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/trace_view.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/trace_view_fwd_bwd_correlation.PNG b/tb_plugins/profiling/tb_plugin/docs/images/trace_view_fwd_bwd_correlation.PNG new file mode 100644 index 0000000000000000000000000000000000000000..c6536ac18d64694299e5389738006e408ca4931e Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/trace_view_fwd_bwd_correlation.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/trace_view_gpu_utilization.PNG b/tb_plugins/profiling/tb_plugin/docs/images/trace_view_gpu_utilization.PNG new file mode 100644 index 0000000000000000000000000000000000000000..4c8bbb0f54ebe6589b40d500cb33f72bca13b64c Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/trace_view_gpu_utilization.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/trace_view_launch.PNG b/tb_plugins/profiling/tb_plugin/docs/images/trace_view_launch.PNG new file mode 100644 index 0000000000000000000000000000000000000000..ec37f3a84ea009f26fd95f1a96161f55cc11a41f Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/trace_view_launch.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/trace_view_one_step.PNG b/tb_plugins/profiling/tb_plugin/docs/images/trace_view_one_step.PNG new file mode 100644 index 0000000000000000000000000000000000000000..49690e3f594bf1ae6be1ab3f4079a43c863b74a5 Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/trace_view_one_step.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/docs/images/vscode_stack.PNG b/tb_plugins/profiling/tb_plugin/docs/images/vscode_stack.PNG new file mode 100644 index 0000000000000000000000000000000000000000..afb99f06937642b207cce36db715be9f9ec78334 Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/docs/images/vscode_stack.PNG differ diff --git a/tb_plugins/profiling/tb_plugin/examples/datapipe_example.py b/tb_plugins/profiling/tb_plugin/examples/datapipe_example.py new file mode 100644 index 0000000000000000000000000000000000000000..a6eac79b9d993decb3ecea7aa90b1b1fecf8d228 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/examples/datapipe_example.py @@ -0,0 +1,50 @@ +import torch +import torch.nn as nn +import torch.optim +from torch.utils.data.dataloader_experimental import DataLoader2 + +from torchvision import transforms as T +import torchvision.prototype.datasets as pdatasets +import torchvision.prototype.models as models +from torchvision.prototype.datasets._builtin import 
Cifar10
+
+
+if __name__ == "__main__":
+    model = models.resnet50(models.ResNet50_Weights.ImageNet1K_V1)
+    trainset = Cifar10().to_datapipe(root='./data', decoder=pdatasets.decoder.raw)
+    transform = T.Compose([T.Resize(256), T.CenterCrop(224)])
+    trainset = trainset.map(transform, input_col="image")
+    trainset = trainset.map(fn=T.functional.convert_image_dtype, input_col="image")
+    dl = DataLoader2(trainset, batch_size=64)
+    criterion = nn.CrossEntropyLoss().cuda(0)
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
+    device = torch.device("cuda:0")
+    model.to(device=device).train()
+
+    with torch.profiler.profile(
+        activities=[
+            torch.profiler.ProfilerActivity.CPU,
+            torch.profiler.ProfilerActivity.CUDA],
+        schedule=torch.profiler.schedule(
+            wait=1,
+            warmup=1,
+            active=2),
+        on_trace_ready=torch.profiler.tensorboard_trace_handler('./result', worker_name='datapipe0'),
+        record_shapes=True,
+        profile_memory=True,  # This will take 1 to 2 minutes; setting it to False can greatly speed things up.
+        with_stack=True
+    ) as p:
+        for step, data in enumerate(dl, 0):
+            print("step:{}".format(step))
+            input_tensors = data['image']
+            label_tensors = data['label']
+            inputs, labels = input_tensors.to(device=device), label_tensors.to(device=device)
+            outputs = model(inputs)
+            loss = criterion(outputs, labels)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            if step + 1 >= 4:
+                break
+            p.step()
+    print("done")
diff --git a/tb_plugins/profiling/tb_plugin/examples/resnet50_autograd_api.py b/tb_plugins/profiling/tb_plugin/examples/resnet50_autograd_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ff5d89aaaab1dc2802c9be16e7f75a37bc2f803
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/examples/resnet50_autograd_api.py
@@ -0,0 +1,46 @@
+import os
+import torch
+import torch.nn as nn
+import torch.backends.cudnn as cudnn
+import torch.optim
+import torch.utils.data
+import torchvision
+import torchvision.transforms as T
+import torchvision.models as models
+
+from torch.autograd.profiler import profile
+
+model = models.resnet50(pretrained=True)
+model.cuda()
+cudnn.benchmark = True
+
+transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()])
+trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
+                                        download=True, transform=transform)
+trainloader = torch.utils.data.DataLoader(trainset, batch_size=32,
+                                          shuffle=True, num_workers=0)
+
+criterion = nn.CrossEntropyLoss().cuda()
+optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
+device = torch.device("cuda:0")
+model.train()
+
+with profile(use_cuda=True, use_kineto=True, record_shapes=True) as p:
+    for step, data in enumerate(trainloader, 0):
+        print("step:{}".format(step))
+        inputs, labels = data[0].to(device=device), data[1].to(device=device)
+
+        outputs = model(inputs)
+        loss = criterion(outputs, labels)
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        if step >= 5:
+            break
+
+try:
+    os.mkdir("result")
+except Exception:
+    pass
+p.export_chrome_trace("./result/worker0.pt.trace.json")
diff --git a/tb_plugins/profiling/tb_plugin/examples/resnet50_ddp_profiler.py b/tb_plugins/profiling/tb_plugin/examples/resnet50_ddp_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ebcfe989e8261026da74cf0d83189b715182fba
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/examples/resnet50_ddp_profiler.py
@@ -0,0 +1,95 @@
+import os
+
+import torch
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.multiprocessing as mp +import torch.nn as nn +import torch.optim +import torch.profiler +import torch.utils.data +import torchvision +import torchvision.models as models +import torchvision.transforms as T +from torch.nn.parallel import DistributedDataParallel as DDP + + +def example(rank, use_gpu=True): + if use_gpu: + torch.cuda.set_device(rank) + model = models.resnet50(pretrained=True).to(rank) + model.cuda() + cudnn.benchmark = True + model = DDP(model, device_ids=[rank]) + else: + model = models.resnet50(pretrained=True) + model = DDP(model) + + # Use gradient compression to reduce communication + # model.register_comm_hook(None, default.fp16_compress_hook) + # or + # state = powerSGD_hook.PowerSGDState(process_group=None,matrix_approximation_rank=1,start_powerSGD_iter=2) + # model.register_comm_hook(state, powerSGD_hook.powerSGD_hook) + + transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()]) + trainset = torchvision.datasets.CIFAR10(root='./data', train=True, + download=True, transform=transform) + train_sampler = torch.utils.data.distributed.DistributedSampler(trainset) + trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, sampler=train_sampler, + shuffle=False, num_workers=4) + + if use_gpu: + criterion = nn.CrossEntropyLoss().to(rank) + else: + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) + model.train() + + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA], + schedule=torch.profiler.schedule( + wait=2, + warmup=2, + active=5), + with_stack=False, + on_trace_ready=torch.profiler.tensorboard_trace_handler('./result'), + record_shapes=True + ) as p: + for step, data in enumerate(trainloader, 0): + print("step:{}".format(step)) + if use_gpu: + inputs, labels = data[0].to(rank), data[1].to(rank) + else: + inputs, labels = data[0], data[1] + outputs = model(inputs) + loss = criterion(outputs, labels) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + p.step() + if step + 1 >= 10: + break + + +def init_process(rank, size, fn, backend='nccl'): + """ Initialize the distributed environment. 
""" + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '29500' + dist.init_process_group(backend, rank=rank, world_size=size) + fn(rank, size) + + +if __name__ == "__main__": + size = 4 + processes = [] + mp.set_start_method("spawn") + for rank in range(size): + p = mp.Process(target=init_process, args=(rank, size, example)) + p.start() + processes.append(p) + + for p in processes: + p.join() diff --git a/tb_plugins/profiling/tb_plugin/examples/resnet50_profiler_api.py b/tb_plugins/profiling/tb_plugin/examples/resnet50_profiler_api.py new file mode 100644 index 0000000000000000000000000000000000000000..cdfa14aa77e1b82101a2083acff86c9e8de2890d --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/examples/resnet50_profiler_api.py @@ -0,0 +1,52 @@ +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +import torch.optim +import torch.utils.data +import torchvision +import torchvision.transforms as T +import torchvision.models as models + +import torch.profiler + +model = models.resnet50(pretrained=True) +model.cuda() +cudnn.benchmark = True + +transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()]) +trainset = torchvision.datasets.CIFAR10(root='./data', train=True, + download=True, transform=transform) +trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, + shuffle=True, num_workers=4) + +criterion = nn.CrossEntropyLoss().cuda() +optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) +device = torch.device("cuda:0") +model.train() + +with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA], + schedule=torch.profiler.schedule( + wait=1, + warmup=1, + active=2), + on_trace_ready=torch.profiler.tensorboard_trace_handler('./result', worker_name='worker0'), + record_shapes=True, + profile_memory=True, # This will take 1 to 2 minutes. Setting it to False could greatly speedup. + with_stack=True +) as p: + for step, data in enumerate(trainloader, 0): + print("step:{}".format(step)) + inputs, labels = data[0].to(device=device), data[1].to(device=device) + + outputs = model(inputs) + loss = criterion(outputs, labels) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + if step + 1 >= 4: + break + p.step() diff --git a/tb_plugins/profiling/tb_plugin/fe/.gitignore b/tb_plugins/profiling/tb_plugin/fe/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e6a92696825082a8ff08815b553822cf7a4c4c8e --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/.gitignore @@ -0,0 +1,3 @@ +/dist +/node_modules +*.log diff --git a/tb_plugins/profiling/tb_plugin/fe/README.md b/tb_plugins/profiling/tb_plugin/fe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..aa9cf2587780da8cd911a88c10f763516fc8867f --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/README.md @@ -0,0 +1,18 @@ +# Pytorch Profiler + +### Install & Build + +1. install [Node.js](https://nodejs.org/) + * ```bash + curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash - + sudo apt-get install -y nodejs``` +2. install [Yarn](https://yarnpkg.com/) + * ```bash + curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add - + echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list + sudo apt update && sudo apt install yarn + ``` +3. shell `yarn` to prepare JS dependency +4. shell `yarn build:copy` +5. Go to `tb_plugin` folder and install the package using `python setup.py develop` +6. 
diff --git a/tb_plugins/profiling/tb_plugin/fe/index.html b/tb_plugins/profiling/tb_plugin/fe/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..a58ddc088336085b78597616844ebb131f49ad51
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/index.html
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+ + diff --git a/tb_plugins/profiling/tb_plugin/fe/package.json b/tb_plugins/profiling/tb_plugin/fe/package.json new file mode 100644 index 0000000000000000000000000000000000000000..7beeb696c42d7c069cd62662c5e7c8182e9725a1 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/package.json @@ -0,0 +1,44 @@ +{ + "name": "fe", + "private": "true", + "version": "1.0.0", + "main": "index.js", + "scripts": { + "build": "cross-env NODE_ENV=production webpack", + "build:copy": "yarn build && node ./update-static.js", + "build:dev": "webpack", + "dev": "webpack serve", + "prettier": "prettier --config ./prettier.json --write ./src/**/*.{ts,tsx} ./*.js" + }, + "dependencies": { + "@babel/runtime": "^7.13.10", + "@material-ui/core": "^4.11.3", + "@material-ui/icons": "^4.11.2", + "antd": "^4.17.0", + "clsx": "^1.1.1", + "portable-fetch": "^3.0.0", + "react": "^16.13.1", + "react-dom": "^16.13.1", + "react-flame-graph": "^1.4.0" + }, + "devDependencies": { + "@types/react": "^16.9.51", + "@types/react-dom": "^16.9.8", + "cross-env": "^7.0.2", + "css-loader": "^5.2.4", + "html-webpack-plugin": "^5.3.1", + "inline-chunk-html-plugin": "^1.1.1", + "prettier": "^2.1.2", + "style-loader": "^2.0.0", + "ts-loader": "^8.0.18", + "typescript": "^4.0.3", + "webpack": "^5.28.0", + "webpack-cli": "^4.5.0", + "webpack-dev-server": "^4.7.4" + }, + "resolutions": { + "portable-fetch/**/node-fetch": "^2.6.1", + "webpack/**/browserslist": "^4.16.5", + "postcss/**/nanoid": "^3.1.31" + } +} diff --git a/tb_plugins/profiling/tb_plugin/fe/prettier.json b/tb_plugins/profiling/tb_plugin/fe/prettier.json new file mode 100644 index 0000000000000000000000000000000000000000..6049640793f6907bbd38c7065360df0ac24d64d4 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/prettier.json @@ -0,0 +1,12 @@ +{ + "parser": "typescript", + "semi": false, + "singleQuote": true, + "jsxSingleQuote": false, + "bracketSpacing": true, + "tabWidth": 2, + "useTabs": false, + "trailingComma": "none", + "proseWrap": "always", + "endOfLine": "lf" +} diff --git a/tb_plugins/profiling/tb_plugin/fe/scripts/add_header.py b/tb_plugins/profiling/tb_plugin/fe/scripts/add_header.py new file mode 100644 index 0000000000000000000000000000000000000000..a36b606377060a4e6a584aa3e24ffc71c30dfb90 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/scripts/add_header.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +import glob +import os +import sys + +HEADER = '''/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. 
+ *--------------------------------------------------------------------------------------------*/
+
+'''
+
+
+def add_header(file):
+    with open(file, 'r') as f:
+        contents = f.readlines()
+
+    # do nothing if there is already a header
+    if contents and contents[0].startswith('/*-'):
+        return
+
+    with open(file, 'w') as out:
+        out.write(HEADER)
+        out.writelines(contents)
+
+
+if __name__ == '__main__':
+    dir = sys.argv[1]
+    if not os.path.isdir(dir):
+        raise ValueError('{} is not a directory'.format(dir))
+
+    for file in glob.glob(dir + '/*.ts'):
+        add_header(file)
diff --git a/tb_plugins/profiling/tb_plugin/fe/scripts/build.sh b/tb_plugins/profiling/tb_plugin/fe/scripts/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..014a26e26c3b58421b878c886fd1899cae2758b3
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/scripts/build.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -e
+
+current_dir="$( cd "$( dirname "$0" )" && pwd )"
+FE_ROOT="$(dirname "$current_dir")"
+cd $FE_ROOT/
+
+java -jar $FE_ROOT/swagger-codegen-cli.jar generate -i $FE_ROOT/src/api/openapi.yaml -l typescript-fetch -o $FE_ROOT/src/api/generated/ --additional-properties modelPropertyNaming=original
+rm $FE_ROOT/src/api/generated/api_test.spec.ts
+yarn prettier --end-of-line lf
+python $FE_ROOT/scripts/add_header.py $FE_ROOT/src/api/generated/
+
+yarn build:copy
diff --git a/tb_plugins/profiling/tb_plugin/fe/scripts/setup.sh b/tb_plugins/profiling/tb_plugin/fe/scripts/setup.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fb2680d9816139854c58b87c6293a8c11ce685f4
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/scripts/setup.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+set -e
+
+current_dir="$( cd "$( dirname "$0" )" && pwd )"
+FE_ROOT="$(dirname "$current_dir")"
+
+# install nodejs
+if ! command -v node &> /dev/null
+then
+    curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash -
+    sudo apt-get install -y nodejs
+fi
+
+# install yarn
+if ! command -v yarn &> /dev/null
+then
+    curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
+    echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
+    sudo apt update && sudo apt install yarn
+fi
+
+# download swagger-codegen-cli into FE_ROOT (the path checked above)
+if [[ ! -f "$FE_ROOT/swagger-codegen-cli.jar" ]]; then
+    wget https://repo1.maven.org/maven2/io/swagger/codegen/v3/swagger-codegen-cli/3.0.25/swagger-codegen-cli-3.0.25.jar -O "$FE_ROOT/swagger-codegen-cli.jar"
+fi
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/api/README.md b/tb_plugins/profiling/tb_plugin/fe/src/api/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..06208c419e1c72c4d49e3dc06f8304d4198b27c2
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/src/api/README.md
@@ -0,0 +1,13 @@
+# How to generate the api.ts
+
+## Prerequisites
+1. Install Java
+2.
run command +```bash + cd fe + wget https://repo1.maven.org/maven2/io/swagger/codegen/v3/swagger-codegen-cli/3.0.25/swagger-codegen-cli-3.0.25.jar -O swagger-codegen-cli.jar + java -jar swagger-codegen-cli.jar generate -i ./src/api/openapi.yaml -l typescript-fetch -o ./src/api/generated/ --additional-properties modelPropertyNaming=original + rm ./src/api/generated/api_test.spec.ts + yarn prettier --end-of-line lf + python ./scripts/add_header.py ./src/api/generated/ +``` diff --git a/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts new file mode 100644 index 0000000000000000000000000000000000000000..b33fe1b6a84730b728177abbbe4ab7e3e489b638 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts @@ -0,0 +1,4535 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +/// +// tslint:disable +/** + * Pytorch profile API + * No description provided (generated by Swagger Codegen https://github.com/swagger-api/swagger-codegen) + * + * OpenAPI spec version: 1.0.0 + * + * + * NOTE: This file is auto generated by the swagger code generator program. + * https://github.com/swagger-api/swagger-codegen.git + * Do not edit the file manually. + */ + +import * as url from 'url' +import * as portableFetch from 'portable-fetch' +import { Configuration } from './configuration' + +const BASE_PATH = '.'.replace(/\/+$/, '') + +/** + * + * @export + */ +export const COLLECTION_FORMATS = { + csv: ',', + ssv: ' ', + tsv: '\t', + pipes: '|' +} + +/** + * + * @export + * @interface FetchAPI + */ +export interface FetchAPI { + (url: string, init?: any): Promise +} + +/** + * + * @export + * @interface FetchArgs + */ +export interface FetchArgs { + url: string + options: any +} + +/** + * + * @export + * @class BaseAPI + */ +export class BaseAPI { + protected configuration: Configuration + + constructor( + configuration?: Configuration, + protected basePath: string = BASE_PATH, + protected fetch: FetchAPI = portableFetch + ) { + if (configuration) { + this.configuration = configuration + this.basePath = configuration.basePath || this.basePath + } + } +} + +/** + * + * @export + * @class RequiredError + * @extends {Error} + */ +export class RequiredError extends Error { + name: 'RequiredError' + constructor(public field: string, msg?: string) { + super(msg) + } +} + +/** + * + * @export + * @interface CallStackTableData + */ +export interface CallStackTableData extends Array {} +/** + * + * @export + * @interface CallStackTableDataInner + */ +export interface CallStackTableDataInner { + /** + * + * @type {string} + * @memberof CallStackTableDataInner + */ + name: string + /** + * + * @type {string} + * @memberof CallStackTableDataInner + */ + input_shape?: string + /** + * + * @type {number} + * @memberof CallStackTableDataInner + */ + calls: number + /** + * + * @type {number} + * @memberof CallStackTableDataInner + */ + device_self_duration?: number + /** + * + * @type {number} + * @memberof CallStackTableDataInner + */ + device_total_duration?: number + /** + * + * @type {number} + * @memberof CallStackTableDataInner + */ + host_self_duration: number + /** + * + * @type {number} + * @memberof CallStackTableDataInner + */ + host_total_duration: number + /** + * + * @type {string} + * @memberof CallStackTableDataInner + */ 
+ call_stack?: string + /** + * + * @type {string} + * @memberof CallStackTableDataInner + */ + tc_eligible?: string + /** + * + * @type {number} + * @memberof CallStackTableDataInner + */ + tc_self_ratio?: number + /** + * + * @type {number} + * @memberof CallStackTableDataInner + */ + tc_total_ratio?: number +} +/** + * + * @export + * @interface DiffNode + */ +export interface DiffNode { + /** + * + * @type {OpStats} + * @memberof DiffNode + */ + left: OpStats + /** + * + * @type {OpStats} + * @memberof DiffNode + */ + right: OpStats + /** + * + * @type {string} + * @memberof DiffNode + */ + path: string + /** + * + * @type {Array} + * @memberof DiffNode + */ + children: Array +} +/** + * + * @export + * @interface DistributedGraph + */ +export interface DistributedGraph { + /** + * + * @type {DistributedGraphMetadata} + * @memberof DistributedGraph + */ + metadata: DistributedGraphMetadata + /** + * + * @type {any} + * @memberof DistributedGraph + */ + data: any +} +/** + * + * @export + * @interface DistributedGraphMetadata + */ +export interface DistributedGraphMetadata { + /** + * + * @type {string} + * @memberof DistributedGraphMetadata + */ + title: string + /** + * + * @type {Array} + * @memberof DistributedGraphMetadata + */ + legends: Array + /** + * + * @type {string} + * @memberof DistributedGraphMetadata + */ + units: string +} +/** + * + * @export + * @interface Environment + */ +export interface Environment { + /** + * + * @type {string} + * @memberof Environment + */ + title: string + /** + * + * @type {string} + * @memberof Environment + */ + value: string +} +/** + * + * @export + * @interface GpuInfo + */ +export interface GpuInfo { + /** + * + * @type {GpuInfoMetadata} + * @memberof GpuInfo + */ + metadata: GpuInfoMetadata + /** + * + * @type {any} + * @memberof GpuInfo + */ + data: any +} +/** + * + * @export + * @interface GpuInfoMetadata + */ +export interface GpuInfoMetadata { + /** + * + * @type {string} + * @memberof GpuInfoMetadata + */ + title: string +} +/** + * + * @export + * @interface GpuMetric + */ +export interface GpuMetric { + /** + * + * @type {string} + * @memberof GpuMetric + */ + title: string + /** + * + * @type {string} + * @memberof GpuMetric + */ + value: string +} +/** + * + * @export + * @interface GpuMetrics + */ +export interface GpuMetrics { + /** + * + * @type {Array} + * @memberof GpuMetrics + */ + data: Array + /** + * + * @type {string} + * @memberof GpuMetrics + */ + tooltip: string +} +/** + * + * @export + * @interface Graph + */ +export interface Graph { + /** + * + * @type {string} + * @memberof Graph + */ + title?: string + /** + * + * @type {Array} + * @memberof Graph + */ + columns: Array + /** + * + * @type {Array>} + * @memberof Graph + */ + rows: Array> +} +/** + * + * @export + * @interface GraphColumn + */ +export interface GraphColumn { + /** + * + * @type {string} + * @memberof GraphColumn + */ + type: string + /** + * + * @type {string} + * @memberof GraphColumn + */ + name: string + /** + * + * @type {string} + * @memberof GraphColumn + */ + role?: string + /** + * + * @type {GraphColumnP} + * @memberof GraphColumn + */ + p?: GraphColumnP +} +/** + * + * @export + * @interface GraphColumnP + */ +export interface GraphColumnP { + /** + * + * @type {boolean} + * @memberof GraphColumnP + */ + html?: boolean +} +/** + * + * @export + * @interface InlineResponse200 + */ +export interface InlineResponse200 { + /** + * + * @type {TableMetadata} + * @memberof InlineResponse200 + */ + metadata: TableMetadata + /** + * + * 
@type {OperationTableData} + * @memberof InlineResponse200 + */ + data: OperationTableData +} +/** + * + * @export + * @interface InlineResponse2001 + */ +export interface InlineResponse2001 { + /** + * + * @type {TableMetadata} + * @memberof InlineResponse2001 + */ + metadata: TableMetadata + /** + * + * @type {CallStackTableData} + * @memberof InlineResponse2001 + */ + data: CallStackTableData +} +/** + * + * @export + * @interface InlineResponse2002 + */ +export interface InlineResponse2002 { + /** + * + * @type {GpuInfoMetadata} + * @memberof InlineResponse2002 + */ + metadata: GpuInfoMetadata + /** + * + * @type {any} + * @memberof InlineResponse2002 + */ + data: any +} +/** + * + * @export + * @interface KernelGraph + */ +export interface KernelGraph { + /** + * + * @type {Graph} + * @memberof KernelGraph + */ + total: Graph +} +/** + * + * @export + * @interface KeyedColumn + */ +export interface KeyedColumn { + /** + * + * @type {string} + * @memberof KeyedColumn + */ + type: string + /** + * + * @type {string} + * @memberof KeyedColumn + */ + name: string + /** + * + * @type {string} + * @memberof KeyedColumn + */ + key: string +} +/** + * + * @export + * @interface MemoryCurveData + */ +export interface MemoryCurveData { + /** + * + * @type {MemoryCurveDataMetadata} + * @memberof MemoryCurveData + */ + metadata: MemoryCurveDataMetadata + /** + * + * @type {Array} + * @memberof MemoryCurveData + */ + columns: Array + /** + * + * @type {any} + * @memberof MemoryCurveData + */ + rows: any +} +/** + * + * @export + * @interface MemoryCurveDataMetadata + */ +export interface MemoryCurveDataMetadata { + /** + * + * @type {string} + * @memberof MemoryCurveDataMetadata + */ + default_device: string + /** + * + * @type {Array} + * @memberof MemoryCurveDataMetadata + */ + devices: Array + /** + * + * @type {any} + * @memberof MemoryCurveDataMetadata + */ + peaks: any + /** + * + * @type {any} + * @memberof MemoryCurveDataMetadata + */ + totals: any + /** + * + * @type {number} + * @memberof MemoryCurveDataMetadata + */ + first_ts: number + /** + * + * @type {string} + * @memberof MemoryCurveDataMetadata + */ + time_metric: string + /** + * + * @type {string} + * @memberof MemoryCurveDataMetadata + */ + memory_metric: string + /** + * + * @type {number} + * @memberof MemoryCurveDataMetadata + */ + time_factor: number + /** + * + * @type {number} + * @memberof MemoryCurveDataMetadata + */ + memory_factor: number +} +/** + * + * @export + * @interface MemoryEventsData + */ +export interface MemoryEventsData { + /** + * + * @type {MemoryEventsTableMetadata} + * @memberof MemoryEventsData + */ + metadata: MemoryEventsTableMetadata + /** + * + * @type {Array} + * @memberof MemoryEventsData + */ + columns: Array + /** + * + * @type {any} + * @memberof MemoryEventsData + */ + rows: any +} +/** + * + * @export + * @interface MemoryEventsTableMetadata + */ +export interface MemoryEventsTableMetadata { + /** + * + * @type {string} + * @memberof MemoryEventsTableMetadata + */ + title: string + /** + * + * @type {string} + * @memberof MemoryEventsTableMetadata + */ + default_device: string + /** + * + * @type {string} + * @memberof MemoryEventsTableMetadata + */ + search?: string + /** + * + * @type {string} + * @memberof MemoryEventsTableMetadata + */ + sort?: string +} +/** + * + * @export + * @interface MemoryStatsData + */ +export interface MemoryStatsData { + /** + * + * @type {MemoryStatsTableMetadata} + * @memberof MemoryStatsData + */ + metadata: MemoryStatsTableMetadata + /** + * + * @type 
{Array} + * @memberof MemoryStatsData + */ + columns: Array + /** + * + * @type {any} + * @memberof MemoryStatsData + */ + rows: any +} +/** + * + * @export + * @interface MemoryStatsTableMetadata + */ +export interface MemoryStatsTableMetadata { + /** + * + * @type {string} + * @memberof MemoryStatsTableMetadata + */ + title: string + /** + * + * @type {string} + * @memberof MemoryStatsTableMetadata + */ + default_device: string + /** + * + * @type {string} + * @memberof MemoryStatsTableMetadata + */ + search: string + /** + * + * @type {string} + * @memberof MemoryStatsTableMetadata + */ + sort: string +} +/** + * + * @export + * @interface ModuleStats + */ +export interface ModuleStats { + /** + * + * @type {string} + * @memberof ModuleStats + */ + name: string + /** + * + * @type {string} + * @memberof ModuleStats + */ + id: string + /** + * + * @type {number} + * @memberof ModuleStats + */ + occurences: number + /** + * + * @type {number} + * @memberof ModuleStats + */ + operators: number + /** + * + * @type {number} + * @memberof ModuleStats + */ + host_duration: number + /** + * + * @type {number} + * @memberof ModuleStats + */ + self_host_duration: number + /** + * + * @type {number} + * @memberof ModuleStats + */ + device_duration: number + /** + * + * @type {number} + * @memberof ModuleStats + */ + self_device_duration: number + /** + * + * @type {number} + * @memberof ModuleStats + */ + avg_duration: number + /** + * + * @type {Array} + * @memberof ModuleStats + */ + children: Array +} +/** + * + * @export + * @interface ModuleViewData + */ +export interface ModuleViewData { + /** + * + * @type {Array} + * @memberof ModuleViewData + */ + columns: Array + /** + * + * @type {Array} + * @memberof ModuleViewData + */ + data: Array +} +/** + * + * @export + * @interface OpAgg + */ +export interface OpAgg { + /** + * + * @type {string} + * @memberof OpAgg + */ + name: string + /** + * + * @type {number} + * @memberof OpAgg + */ + calls: number + /** + * + * @type {number} + * @memberof OpAgg + */ + host_duration: number + /** + * + * @type {number} + * @memberof OpAgg + */ + device_duration: number + /** + * + * @type {number} + * @memberof OpAgg + */ + self_host_duration: number + /** + * + * @type {number} + * @memberof OpAgg + */ + self_device_duration: number +} +/** + * + * @export + * @interface OpStats + */ +export interface OpStats { + /** + * + * @type {string} + * @memberof OpStats + */ + name: string + /** + * + * @type {number} + * @memberof OpStats + */ + duration: number + /** + * + * @type {number} + * @memberof OpStats + */ + device_duration: number + /** + * + * @type {number} + * @memberof OpStats + */ + total_duration: number + /** + * + * @type {Array} + * @memberof OpStats + */ + aggs: Array +} +/** + * + * @export + * @interface OperationTableData + */ +export interface OperationTableData extends Array {} +/** + * + * @export + * @interface OperationTableDataInner + */ +export interface OperationTableDataInner { + /** + * + * @type {string} + * @memberof OperationTableDataInner + */ + name: string + /** + * + * @type {string} + * @memberof OperationTableDataInner + */ + input_shape?: string + /** + * + * @type {number} + * @memberof OperationTableDataInner + */ + calls: number + /** + * + * @type {number} + * @memberof OperationTableDataInner + */ + device_self_duration?: number + /** + * + * @type {number} + * @memberof OperationTableDataInner + */ + device_total_duration?: number + /** + * + * @type {number} + * @memberof OperationTableDataInner + */ + 
host_self_duration: number + /** + * + * @type {number} + * @memberof OperationTableDataInner + */ + host_total_duration: number + /** + * + * @type {boolean} + * @memberof OperationTableDataInner + */ + has_call_stack: boolean + /** + * + * @type {string} + * @memberof OperationTableDataInner + */ + tc_eligible?: string + /** + * + * @type {number} + * @memberof OperationTableDataInner + */ + tc_self_ratio?: number + /** + * + * @type {number} + * @memberof OperationTableDataInner + */ + tc_total_ratio?: number +} +/** + * + * @export + * @interface OperatorGraph + */ +export interface OperatorGraph { + /** + * + * @type {Graph} + * @memberof OperatorGraph + */ + device_total_time: Graph + /** + * + * @type {Graph} + * @memberof OperatorGraph + */ + device_self_time: Graph + /** + * + * @type {Graph} + * @memberof OperatorGraph + */ + host_total_time: Graph + /** + * + * @type {Graph} + * @memberof OperatorGraph + */ + host_self_time: Graph +} +/** + * + * @export + * @interface OperatorNode + */ +export interface OperatorNode { + /** + * + * @type {string} + * @memberof OperatorNode + */ + name: string + /** + * + * @type {number} + * @memberof OperatorNode + */ + start_time: number + /** + * + * @type {number} + * @memberof OperatorNode + */ + end_time: number + /** + * + * @type {string} + * @memberof OperatorNode + */ + type: string + /** + * + * @type {number} + * @memberof OperatorNode + */ + tid: number + /** + * + * @type {Array} + * @memberof OperatorNode + */ + children: Array +} +/** + * + * @export + * @interface Overview + */ +export interface Overview { + /** + * + * @type {Array} + * @memberof Overview + */ + performance: Array + /** + * + * @type {Array} + * @memberof Overview + */ + environments: Array + /** + * + * @type {Graph} + * @memberof Overview + */ + steps: Graph + /** + * + * @type {string} + * @memberof Overview + */ + recommendations: string + /** + * + * @type {GpuMetrics} + * @memberof Overview + */ + gpu_metrics?: GpuMetrics +} +/** + * + * @export + * @interface Performance + */ +export interface Performance { + /** + * + * @type {string} + * @memberof Performance + */ + name: string + /** + * + * @type {string} + * @memberof Performance + */ + description?: string + /** + * + * @type {string} + * @memberof Performance + */ + value?: string + /** + * + * @type {string} + * @memberof Performance + */ + extra?: string + /** + * + * @type {Array} + * @memberof Performance + */ + children?: Array +} +/** + * + * @export + * @interface Runs + */ +export interface Runs { + /** + * + * @type {Array} + * @memberof Runs + */ + runs: Array + /** + * + * @type {boolean} + * @memberof Runs + */ + loading: boolean +} +/** + * + * @export + * @interface TableData + */ +export interface TableData { + /** + * + * @type {Graph} + * @memberof TableData + */ + data: Graph + /** + * + * @type {TableMetadata} + * @memberof TableData + */ + metadata: TableMetadata +} +/** + * + * @export + * @interface TableMetadata + */ +export interface TableMetadata { + /** + * + * @type {string} + * @memberof TableMetadata + */ + sort: string + /** + * + * @type {any} + * @memberof TableMetadata + */ + tooltips?: any +} +/** + * + * @export + * @interface TensorCoresGraph + */ +export interface TensorCoresGraph { + /** + * + * @type {Graph} + * @memberof TensorCoresGraph + */ + total: Graph +} +/** + * + * @export + * @interface ValueAndFormat + */ +export interface ValueAndFormat { + /** + * + * @type {string | number | boolean} + * @memberof ValueAndFormat + */ + v: string | number | 
boolean + /** + * + * @type {string} + * @memberof ValueAndFormat + */ + f: string +} +/** + * DefaultApi - fetch parameter creator + * @export + */ +export const DefaultApiFetchParamCreator = function ( + configuration?: Configuration +) { + return { + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} exp_run + * @param {string} exp_worker + * @param {string} exp_span + * @param {string} [path] + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + diffnodeGet( + run: string, + worker: string, + span: string, + exp_run: string, + exp_worker: string, + exp_span: string, + path?: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling diffnodeGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling diffnodeGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling diffnodeGet.' + ) + } + // verify required parameter 'exp_run' is not null or undefined + if (exp_run === null || exp_run === undefined) { + throw new RequiredError( + 'exp_run', + 'Required parameter exp_run was null or undefined when calling diffnodeGet.' + ) + } + // verify required parameter 'exp_worker' is not null or undefined + if (exp_worker === null || exp_worker === undefined) { + throw new RequiredError( + 'exp_worker', + 'Required parameter exp_worker was null or undefined when calling diffnodeGet.' + ) + } + // verify required parameter 'exp_span' is not null or undefined + if (exp_span === null || exp_span === undefined) { + throw new RequiredError( + 'exp_span', + 'Required parameter exp_span was null or undefined when calling diffnodeGet.' + ) + } + const localVarPath = `/diffnode` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + if (exp_run !== undefined) { + localVarQueryParameter['exp_run'] = exp_run + } + + if (exp_worker !== undefined) { + localVarQueryParameter['exp_worker'] = exp_worker + } + + if (exp_span !== undefined) { + localVarQueryParameter['exp_span'] = exp_span + } + + if (path !== undefined) { + localVarQueryParameter['path'] = path + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. 
+ * @throws {RequiredError} + */ + distributedCommopsGet( + run: string, + worker: string, + span: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling distributedCommopsGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling distributedCommopsGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling distributedCommopsGet.' + ) + } + const localVarPath = `/distributed/commops` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + distributedGpuinfoGet( + run: string, + worker: string, + span: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling distributedGpuinfoGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling distributedGpuinfoGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling distributedGpuinfoGet.' 
+ ) + } + const localVarPath = `/distributed/gpuinfo` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + distributedOverlapGet( + run: string, + worker: string, + span: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling distributedOverlapGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling distributedOverlapGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling distributedOverlapGet.' + ) + } + const localVarPath = `/distributed/overlap` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + distributedWaittimeGet( + run: string, + worker: string, + span: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling distributedWaittimeGet.' 
+ ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling distributedWaittimeGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling distributedWaittimeGet.' + ) + } + const localVarPath = `/distributed/waittime` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + kernelGet( + run: string, + worker: string, + span: string, + group_by: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling kernelGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling kernelGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling kernelGet.' + ) + } + // verify required parameter 'group_by' is not null or undefined + if (group_by === null || group_by === undefined) { + throw new RequiredError( + 'group_by', + 'Required parameter group_by was null or undefined when calling kernelGet.' 
+ ) + } + const localVarPath = `/kernel` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + if (group_by !== undefined) { + localVarQueryParameter['group_by'] = group_by + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} [group_by] Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + kernelTableGet( + run: string, + worker: string, + span: string, + group_by?: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling kernelTableGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling kernelTableGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling kernelTableGet.' + ) + } + const localVarPath = `/kernel/table` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + if (group_by !== undefined) { + localVarQueryParameter['group_by'] = group_by + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + kernelTcPieGet( + run: string, + worker: string, + span: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling kernelTcPieGet.' 
+ ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling kernelTcPieGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling kernelTcPieGet.' + ) + } + const localVarPath = `/kernel/tc_pie` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + memoryCurveGet( + run: string, + worker: string, + span: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling memoryCurveGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling memoryCurveGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling memoryCurveGet.' + ) + } + const localVarPath = `/memory_curve` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {number} [start_ts] + * @param {number} [end_ts] + * @param {*} [options] Override http request option. 
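+ * @example
+ * // Hedged sketch: this layer only builds FetchArgs, it does not issue the request.
+ * // The run/worker/span values below are hypothetical, not names from this project.
+ * // start_ts/end_ts, when given, are appended to the query string to bound the
+ * // event window:
+ * // const args = memoryEventsGet('run1', 'worker0', '1', 0, 5000)
+ * // // args.url resembles
+ * // // '/memory_events?run=run1&worker=worker0&span=1&start_ts=0&end_ts=5000'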
+ * @throws {RequiredError} + */ + memoryEventsGet( + run: string, + worker: string, + span: string, + start_ts?: number, + end_ts?: number, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling memoryEventsGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling memoryEventsGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling memoryEventsGet.' + ) + } + const localVarPath = `/memory_events` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + if (start_ts !== undefined) { + localVarQueryParameter['start_ts'] = start_ts + } + + if (end_ts !== undefined) { + localVarQueryParameter['end_ts'] = end_ts + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {number} [start_ts] + * @param {number} [end_ts] + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + memoryGet( + run: string, + worker: string, + span: string, + start_ts?: number, + end_ts?: number, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling memoryGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling memoryGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling memoryGet.' 
+ ) + } + const localVarPath = `/memory` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + if (start_ts !== undefined) { + localVarQueryParameter['start_ts'] = start_ts + } + + if (end_ts !== undefined) { + localVarQueryParameter['end_ts'] = end_ts + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + moduleGet( + run: string, + worker: string, + span: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling moduleGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling moduleGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling moduleGet.' + ) + } + const localVarPath = `/module` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + operationGet( + run: string, + worker: string, + span: string, + group_by: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling operationGet.' 
+ ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling operationGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling operationGet.' + ) + } + // verify required parameter 'group_by' is not null or undefined + if (group_by === null || group_by === undefined) { + throw new RequiredError( + 'group_by', + 'Required parameter group_by was null or undefined when calling operationGet.' + ) + } + const localVarPath = `/operation` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + if (group_by !== undefined) { + localVarQueryParameter['group_by'] = group_by + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {string} op_name + * @param {string} [input_shape] + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + operationStackGet( + run: string, + worker: string, + span: string, + group_by: string, + op_name: string, + input_shape?: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling operationStackGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling operationStackGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling operationStackGet.' + ) + } + // verify required parameter 'group_by' is not null or undefined + if (group_by === null || group_by === undefined) { + throw new RequiredError( + 'group_by', + 'Required parameter group_by was null or undefined when calling operationStackGet.' + ) + } + // verify required parameter 'op_name' is not null or undefined + if (op_name === null || op_name === undefined) { + throw new RequiredError( + 'op_name', + 'Required parameter op_name was null or undefined when calling operationStackGet.' 
+ ) + } + const localVarPath = `/operation/stack` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + if (group_by !== undefined) { + localVarQueryParameter['group_by'] = group_by + } + + if (op_name !== undefined) { + localVarQueryParameter['op_name'] = op_name + } + + if (input_shape !== undefined) { + localVarQueryParameter['input_shape'] = input_shape + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + operationTableGet( + run: string, + worker: string, + span: string, + group_by: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling operationTableGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling operationTableGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling operationTableGet.' + ) + } + // verify required parameter 'group_by' is not null or undefined + if (group_by === null || group_by === undefined) { + throw new RequiredError( + 'group_by', + 'Required parameter group_by was null or undefined when calling operationTableGet.' 
+ ) + } + const localVarPath = `/operation/table` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + if (group_by !== undefined) { + localVarQueryParameter['group_by'] = group_by + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + overviewGet( + run: string, + worker: string, + span: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling overviewGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling overviewGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling overviewGet.' + ) + } + const localVarPath = `/overview` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {*} [options] Override http request option. 
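+ * @example
+ * // Hedged sketch of consuming the returned FetchArgs with any fetch
+ * // implementation; 'myFetch' is a hypothetical placeholder:
+ * // const { url, options } = runsGet()
+ * // myFetch(url, options).then((res) => res.json())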
+ * @throws {RequiredError} + */ + runsGet(options: any = {}): FetchArgs { + const localVarPath = `/runs` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + spansGet(run: string, worker: string, options: any = {}): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling spansGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling spansGet.' + ) + } + const localVarPath = `/spans` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + traceGet( + run: string, + worker: string, + span: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling traceGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling traceGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling traceGet.' 
+ ) + } + const localVarPath = `/trace` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + treeGet( + run: string, + worker: string, + span: string, + options: any = {} + ): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling treeGet.' + ) + } + // verify required parameter 'worker' is not null or undefined + if (worker === null || worker === undefined) { + throw new RequiredError( + 'worker', + 'Required parameter worker was null or undefined when calling treeGet.' + ) + } + // verify required parameter 'span' is not null or undefined + if (span === null || span === undefined) { + throw new RequiredError( + 'span', + 'Required parameter span was null or undefined when calling treeGet.' + ) + } + const localVarPath = `/tree` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (worker !== undefined) { + localVarQueryParameter['worker'] = worker + } + + if (span !== undefined) { + localVarQueryParameter['span'] = span + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + viewsGet(run: string, options: any = {}): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling viewsGet.' 
+ ) + } + const localVarPath = `/views` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + }, + /** + * + * @param {string} run + * @param {string} view + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + workersGet(run: string, view: string, options: any = {}): FetchArgs { + // verify required parameter 'run' is not null or undefined + if (run === null || run === undefined) { + throw new RequiredError( + 'run', + 'Required parameter run was null or undefined when calling workersGet.' + ) + } + // verify required parameter 'view' is not null or undefined + if (view === null || view === undefined) { + throw new RequiredError( + 'view', + 'Required parameter view was null or undefined when calling workersGet.' + ) + } + const localVarPath = `/workers` + const localVarUrlObj = url.parse(localVarPath, true) + const localVarRequestOptions = Object.assign({ method: 'GET' }, options) + const localVarHeaderParameter = {} as any + const localVarQueryParameter = {} as any + + if (run !== undefined) { + localVarQueryParameter['run'] = run + } + + if (view !== undefined) { + localVarQueryParameter['view'] = view + } + + localVarUrlObj.query = Object.assign( + {}, + localVarUrlObj.query, + localVarQueryParameter, + options.query + ) + // fix override query string Detail: https://stackoverflow.com/a/7517673/1077943 + delete localVarUrlObj.search + localVarRequestOptions.headers = Object.assign( + {}, + localVarHeaderParameter, + options.headers + ) + + return { + url: url.format(localVarUrlObj), + options: localVarRequestOptions + } + } + } +} + +/** + * DefaultApi - functional programming interface + * @export + */ +export const DefaultApiFp = function (configuration?: Configuration) { + return { + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} exp_run + * @param {string} exp_worker + * @param {string} exp_span + * @param {string} [path] + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + diffnodeGet( + run: string, + worker: string, + span: string, + exp_run: string, + exp_worker: string, + exp_span: string, + path?: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).diffnodeGet( + run, + worker, + span, + exp_run, + exp_worker, + exp_span, + path, + options + ) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. 
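+ * @example
+ * // Hedged sketch: this functional layer returns a curried executor, so the
+ * // request only fires when the second call supplies fetch/basePath
+ * // (parameter values hypothetical):
+ * // DefaultApiFp().distributedCommopsGet('run1', 'worker0', '1')(
+ * //   portableFetch,
+ * //   BASE_PATH
+ * // )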
+ * @throws {RequiredError} + */ + distributedCommopsGet( + run: string, + worker: string, + span: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).distributedCommopsGet(run, worker, span, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + distributedGpuinfoGet( + run: string, + worker: string, + span: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).distributedGpuinfoGet(run, worker, span, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + distributedOverlapGet( + run: string, + worker: string, + span: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).distributedOverlapGet(run, worker, span, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + distributedWaittimeGet( + run: string, + worker: string, + span: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).distributedWaittimeGet(run, worker, span, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. 
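+ * @example
+ * // Hedged sketch: non-2xx responses are thrown as the raw Response object, so
+ * // callers typically catch and inspect response.status (values hypothetical):
+ * // DefaultApiFp().kernelGet('run1', 'worker0', '1', 'Kernel')()
+ * //   .catch((response) => console.error(response.status))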
+ * @throws {RequiredError} + */ + kernelGet( + run: string, + worker: string, + span: string, + group_by: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).kernelGet(run, worker, span, group_by, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} [group_by] Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + kernelTableGet( + run: string, + worker: string, + span: string, + group_by?: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).kernelTableGet(run, worker, span, group_by, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + kernelTcPieGet( + run: string, + worker: string, + span: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).kernelTcPieGet(run, worker, span, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + memoryCurveGet( + run: string, + worker: string, + span: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).memoryCurveGet(run, worker, span, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {number} [start_ts] + * @param {number} [end_ts] + * @param {*} [options] Override http request option. 
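+ * @example
+ * // Hedged sketch: start_ts/end_ts may be omitted, in which case the query
+ * // string carries only run/worker/span (values hypothetical):
+ * // DefaultApiFp().memoryEventsGet('run1', 'worker0', '1')(
+ * //   portableFetch,
+ * //   BASE_PATH
+ * // )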
+ * @throws {RequiredError} + */ + memoryEventsGet( + run: string, + worker: string, + span: string, + start_ts?: number, + end_ts?: number, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).memoryEventsGet(run, worker, span, start_ts, end_ts, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {number} [start_ts] + * @param {number} [end_ts] + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + memoryGet( + run: string, + worker: string, + span: string, + start_ts?: number, + end_ts?: number, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).memoryGet(run, worker, span, start_ts, end_ts, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + moduleGet( + run: string, + worker: string, + span: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).moduleGet(run, worker, span, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + operationGet( + run: string, + worker: string, + span: string, + group_by: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).operationGet(run, worker, span, group_by, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {string} op_name + * @param {string} [input_shape] + * @param {*} [options] Override http request option. 
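+ * @example
+ * // Hedged sketch: group_by and op_name are required here, while input_shape is
+ * // optional and may simply be left off (values hypothetical):
+ * // DefaultApiFp().operationStackGet(
+ * //   'run1', 'worker0', '1', 'OperationAndInputShape', 'aten::mm'
+ * // )()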
+ * @throws {RequiredError} + */ + operationStackGet( + run: string, + worker: string, + span: string, + group_by: string, + op_name: string, + input_shape?: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).operationStackGet( + run, + worker, + span, + group_by, + op_name, + input_shape, + options + ) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + operationTableGet( + run: string, + worker: string, + span: string, + group_by: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).operationTableGet(run, worker, span, group_by, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + overviewGet( + run: string, + worker: string, + span: string, + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).overviewGet(run, worker, span, options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + runsGet( + options?: any + ): (fetch?: FetchAPI, basePath?: string) => Promise { + const localVarFetchArgs = DefaultApiFetchParamCreator( + configuration + ).runsGet(options) + return ( + fetch: FetchAPI = portableFetch, + basePath: string = BASE_PATH + ) => { + return fetch( + basePath + localVarFetchArgs.url, + localVarFetchArgs.options + ).then((response) => { + if (response.status >= 200 && response.status < 300) { + return response.json() + } else { + throw response + } + }) + } + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {*} [options] Override http request option. 
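+ * @example
+ * // Hedged sketch: the executor resolves with the parsed JSON body on a 2xx
+ * // status (values hypothetical):
+ * // DefaultApiFp().spansGet('run1', 'worker0')().then((spans) =>
+ * //   console.log(spans)
+ * // )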
+ * @throws {RequiredError}
+ */
+ spansGet(
+ run: string,
+ worker: string,
+ options?: any
+ ): (fetch?: FetchAPI, basePath?: string) => Promise<Array<string>> {
+ const localVarFetchArgs = DefaultApiFetchParamCreator(
+ configuration
+ ).spansGet(run, worker, options)
+ return (
+ fetch: FetchAPI = portableFetch,
+ basePath: string = BASE_PATH
+ ) => {
+ return fetch(
+ basePath + localVarFetchArgs.url,
+ localVarFetchArgs.options
+ ).then((response) => {
+ if (response.status >= 200 && response.status < 300) {
+ return response.json()
+ } else {
+ throw response
+ }
+ })
+ }
+ },
+ /**
+ *
+ * @param {string} run
+ * @param {string} worker
+ * @param {string} span
+ * @param {*} [options] Override http request option.
+ * @throws {RequiredError}
+ */
+ traceGet(
+ run: string,
+ worker: string,
+ span: string,
+ options?: any
+ ): (fetch?: FetchAPI, basePath?: string) => Promise {
+ const localVarFetchArgs = DefaultApiFetchParamCreator(
+ configuration
+ ).traceGet(run, worker, span, options)
+ return (
+ fetch: FetchAPI = portableFetch,
+ basePath: string = BASE_PATH
+ ) => {
+ return fetch(
+ basePath + localVarFetchArgs.url,
+ localVarFetchArgs.options
+ ).then((response) => {
+ if (response.status >= 200 && response.status < 300) {
+ return response.json()
+ } else {
+ throw response
+ }
+ })
+ }
+ },
+ /**
+ *
+ * @param {string} run
+ * @param {string} worker
+ * @param {string} span
+ * @param {*} [options] Override http request option.
+ * @throws {RequiredError}
+ */
+ treeGet(
+ run: string,
+ worker: string,
+ span: string,
+ options?: any
+ ): (fetch?: FetchAPI, basePath?: string) => Promise {
+ const localVarFetchArgs = DefaultApiFetchParamCreator(
+ configuration
+ ).treeGet(run, worker, span, options)
+ return (
+ fetch: FetchAPI = portableFetch,
+ basePath: string = BASE_PATH
+ ) => {
+ return fetch(
+ basePath + localVarFetchArgs.url,
+ localVarFetchArgs.options
+ ).then((response) => {
+ if (response.status >= 200 && response.status < 300) {
+ return response.json()
+ } else {
+ throw response
+ }
+ })
+ }
+ },
+ /**
+ *
+ * @param {string} run
+ * @param {*} [options] Override http request option.
+ * @throws {RequiredError}
+ */
+ viewsGet(
+ run: string,
+ options?: any
+ ): (fetch?: FetchAPI, basePath?: string) => Promise<Array<string>> {
+ const localVarFetchArgs = DefaultApiFetchParamCreator(
+ configuration
+ ).viewsGet(run, options)
+ return (
+ fetch: FetchAPI = portableFetch,
+ basePath: string = BASE_PATH
+ ) => {
+ return fetch(
+ basePath + localVarFetchArgs.url,
+ localVarFetchArgs.options
+ ).then((response) => {
+ if (response.status >= 200 && response.status < 300) {
+ return response.json()
+ } else {
+ throw response
+ }
+ })
+ }
+ },
+ /**
+ *
+ * @param {string} run
+ * @param {string} view
+ * @param {*} [options] Override http request option.
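+ * @example
+ * // Hedged sketch: calling the executor without arguments falls back to the
+ * // portableFetch and BASE_PATH defaults (values hypothetical):
+ * // DefaultApiFp().workersGet('run1', 'Operator')()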
+ * @throws {RequiredError}
+ */
+ workersGet(
+ run: string,
+ view: string,
+ options?: any
+ ): (fetch?: FetchAPI, basePath?: string) => Promise<Array<string>> {
+ const localVarFetchArgs = DefaultApiFetchParamCreator(
+ configuration
+ ).workersGet(run, view, options)
+ return (
+ fetch: FetchAPI = portableFetch,
+ basePath: string = BASE_PATH
+ ) => {
+ return fetch(
+ basePath + localVarFetchArgs.url,
+ localVarFetchArgs.options
+ ).then((response) => {
+ if (response.status >= 200 && response.status < 300) {
+ return response.json()
+ } else {
+ throw response
+ }
+ })
+ }
+ }
+ }
+}
+
+/**
+ * DefaultApi - factory interface
+ * @export
+ */
+export const DefaultApiFactory = function (
+ configuration?: Configuration,
+ fetch?: FetchAPI,
+ basePath?: string
+) {
+ return {
+ /**
+ *
+ * @param {string} run
+ * @param {string} worker
+ * @param {string} span
+ * @param {string} exp_run
+ * @param {string} exp_worker
+ * @param {string} exp_span
+ * @param {string} [path]
+ * @param {*} [options] Override http request option.
+ * @throws {RequiredError}
+ */
+ diffnodeGet(
+ run: string,
+ worker: string,
+ span: string,
+ exp_run: string,
+ exp_worker: string,
+ exp_span: string,
+ path?: string,
+ options?: any
+ ) {
+ return DefaultApiFp(configuration).diffnodeGet(
+ run,
+ worker,
+ span,
+ exp_run,
+ exp_worker,
+ exp_span,
+ path,
+ options
+ )(fetch, basePath)
+ },
+ /**
+ *
+ * @param {string} run
+ * @param {string} worker
+ * @param {string} span
+ * @param {*} [options] Override http request option.
+ * @throws {RequiredError}
+ */
+ distributedCommopsGet(
+ run: string,
+ worker: string,
+ span: string,
+ options?: any
+ ) {
+ return DefaultApiFp(configuration).distributedCommopsGet(
+ run,
+ worker,
+ span,
+ options
+ )(fetch, basePath)
+ },
+ /**
+ *
+ * @param {string} run
+ * @param {string} worker
+ * @param {string} span
+ * @param {*} [options] Override http request option.
+ * @throws {RequiredError}
+ */
+ distributedGpuinfoGet(
+ run: string,
+ worker: string,
+ span: string,
+ options?: any
+ ) {
+ return DefaultApiFp(configuration).distributedGpuinfoGet(
+ run,
+ worker,
+ span,
+ options
+ )(fetch, basePath)
+ },
+ /**
+ *
+ * @param {string} run
+ * @param {string} worker
+ * @param {string} span
+ * @param {*} [options] Override http request option.
+ * @throws {RequiredError}
+ */
+ distributedOverlapGet(
+ run: string,
+ worker: string,
+ span: string,
+ options?: any
+ ) {
+ return DefaultApiFp(configuration).distributedOverlapGet(
+ run,
+ worker,
+ span,
+ options
+ )(fetch, basePath)
+ },
+ /**
+ *
+ * @param {string} run
+ * @param {string} worker
+ * @param {string} span
+ * @param {*} [options] Override http request option.
+ * @throws {RequiredError}
+ */
+ distributedWaittimeGet(
+ run: string,
+ worker: string,
+ span: string,
+ options?: any
+ ) {
+ return DefaultApiFp(configuration).distributedWaittimeGet(
+ run,
+ worker,
+ span,
+ options
+ )(fetch, basePath)
+ },
+ /**
+ *
+ * @param {string} run
+ * @param {string} worker
+ * @param {string} span
+ * @param {string} group_by Group By
+ * @param {*} [options] Override http request option.
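+ * @example
+ * // Hedged sketch: the factory closes over configuration/fetch/basePath once, so
+ * // call sites pass endpoint parameters only (values hypothetical):
+ * // const api = DefaultApiFactory(undefined, portableFetch, '/api')
+ * // api.kernelGet('run1', 'worker0', '1', 'Kernel')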
+ * @throws {RequiredError} + */ + kernelGet( + run: string, + worker: string, + span: string, + group_by: string, + options?: any + ) { + return DefaultApiFp(configuration).kernelGet( + run, + worker, + span, + group_by, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} [group_by] Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + kernelTableGet( + run: string, + worker: string, + span: string, + group_by?: string, + options?: any + ) { + return DefaultApiFp(configuration).kernelTableGet( + run, + worker, + span, + group_by, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + kernelTcPieGet(run: string, worker: string, span: string, options?: any) { + return DefaultApiFp(configuration).kernelTcPieGet( + run, + worker, + span, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + memoryCurveGet(run: string, worker: string, span: string, options?: any) { + return DefaultApiFp(configuration).memoryCurveGet( + run, + worker, + span, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {number} [start_ts] + * @param {number} [end_ts] + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + memoryEventsGet( + run: string, + worker: string, + span: string, + start_ts?: number, + end_ts?: number, + options?: any + ) { + return DefaultApiFp(configuration).memoryEventsGet( + run, + worker, + span, + start_ts, + end_ts, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {number} [start_ts] + * @param {number} [end_ts] + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + memoryGet( + run: string, + worker: string, + span: string, + start_ts?: number, + end_ts?: number, + options?: any + ) { + return DefaultApiFp(configuration).memoryGet( + run, + worker, + span, + start_ts, + end_ts, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + moduleGet(run: string, worker: string, span: string, options?: any) { + return DefaultApiFp(configuration).moduleGet( + run, + worker, + span, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + operationGet( + run: string, + worker: string, + span: string, + group_by: string, + options?: any + ) { + return DefaultApiFp(configuration).operationGet( + run, + worker, + span, + group_by, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {string} op_name + * @param {string} [input_shape] + * @param {*} [options] Override http request option. 
+ * @throws {RequiredError} + */ + operationStackGet( + run: string, + worker: string, + span: string, + group_by: string, + op_name: string, + input_shape?: string, + options?: any + ) { + return DefaultApiFp(configuration).operationStackGet( + run, + worker, + span, + group_by, + op_name, + input_shape, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + operationTableGet( + run: string, + worker: string, + span: string, + group_by: string, + options?: any + ) { + return DefaultApiFp(configuration).operationTableGet( + run, + worker, + span, + group_by, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + overviewGet(run: string, worker: string, span: string, options?: any) { + return DefaultApiFp(configuration).overviewGet( + run, + worker, + span, + options + )(fetch, basePath) + }, + /** + * + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + runsGet(options?: any) { + return DefaultApiFp(configuration).runsGet(options)(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + spansGet(run: string, worker: string, options?: any) { + return DefaultApiFp(configuration).spansGet( + run, + worker, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + traceGet(run: string, worker: string, span: string, options?: any) { + return DefaultApiFp(configuration).traceGet( + run, + worker, + span, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + treeGet(run: string, worker: string, span: string, options?: any) { + return DefaultApiFp(configuration).treeGet( + run, + worker, + span, + options + )(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + viewsGet(run: string, options?: any) { + return DefaultApiFp(configuration).viewsGet(run, options)(fetch, basePath) + }, + /** + * + * @param {string} run + * @param {string} view + * @param {*} [options] Override http request option. + * @throws {RequiredError} + */ + workersGet(run: string, view: string, options?: any) { + return DefaultApiFp(configuration).workersGet( + run, + view, + options + )(fetch, basePath) + } + } +} + +/** + * DefaultApi - object-oriented interface + * @export + * @class DefaultApi + * @extends {BaseAPI} + */ +export class DefaultApi extends BaseAPI { + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} exp_run + * @param {string} exp_worker + * @param {string} exp_span + * @param {string} [path] + * @param {*} [options] Override http request option. 
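+ * @example
+ * // Hedged sketch: diffnode compares a baseline run against an experimental one,
+ * // so both run/worker/span triples are required (values hypothetical):
+ * // new DefaultApi().diffnodeGet('base', 'worker0', '1', 'exp', 'worker0', '1')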
+ * @throws {RequiredError} + * @memberof DefaultApi + */ + public diffnodeGet( + run: string, + worker: string, + span: string, + exp_run: string, + exp_worker: string, + exp_span: string, + path?: string, + options?: any + ) { + return DefaultApiFp(this.configuration).diffnodeGet( + run, + worker, + span, + exp_run, + exp_worker, + exp_span, + path, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public distributedCommopsGet( + run: string, + worker: string, + span: string, + options?: any + ) { + return DefaultApiFp(this.configuration).distributedCommopsGet( + run, + worker, + span, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public distributedGpuinfoGet( + run: string, + worker: string, + span: string, + options?: any + ) { + return DefaultApiFp(this.configuration).distributedGpuinfoGet( + run, + worker, + span, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public distributedOverlapGet( + run: string, + worker: string, + span: string, + options?: any + ) { + return DefaultApiFp(this.configuration).distributedOverlapGet( + run, + worker, + span, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public distributedWaittimeGet( + run: string, + worker: string, + span: string, + options?: any + ) { + return DefaultApiFp(this.configuration).distributedWaittimeGet( + run, + worker, + span, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public kernelGet( + run: string, + worker: string, + span: string, + group_by: string, + options?: any + ) { + return DefaultApiFp(this.configuration).kernelGet( + run, + worker, + span, + group_by, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} [group_by] Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public kernelTableGet( + run: string, + worker: string, + span: string, + group_by?: string, + options?: any + ) { + return DefaultApiFp(this.configuration).kernelTableGet( + run, + worker, + span, + group_by, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. 
+ * @throws {RequiredError} + * @memberof DefaultApi + */ + public kernelTcPieGet( + run: string, + worker: string, + span: string, + options?: any + ) { + return DefaultApiFp(this.configuration).kernelTcPieGet( + run, + worker, + span, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public memoryCurveGet( + run: string, + worker: string, + span: string, + options?: any + ) { + return DefaultApiFp(this.configuration).memoryCurveGet( + run, + worker, + span, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {number} [start_ts] + * @param {number} [end_ts] + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public memoryEventsGet( + run: string, + worker: string, + span: string, + start_ts?: number, + end_ts?: number, + options?: any + ) { + return DefaultApiFp(this.configuration).memoryEventsGet( + run, + worker, + span, + start_ts, + end_ts, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {number} [start_ts] + * @param {number} [end_ts] + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public memoryGet( + run: string, + worker: string, + span: string, + start_ts?: number, + end_ts?: number, + options?: any + ) { + return DefaultApiFp(this.configuration).memoryGet( + run, + worker, + span, + start_ts, + end_ts, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public moduleGet(run: string, worker: string, span: string, options?: any) { + return DefaultApiFp(this.configuration).moduleGet( + run, + worker, + span, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public operationGet( + run: string, + worker: string, + span: string, + group_by: string, + options?: any + ) { + return DefaultApiFp(this.configuration).operationGet( + run, + worker, + span, + group_by, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {string} op_name + * @param {string} [input_shape] + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public operationStackGet( + run: string, + worker: string, + span: string, + group_by: string, + op_name: string, + input_shape?: string, + options?: any + ) { + return DefaultApiFp(this.configuration).operationStackGet( + run, + worker, + span, + group_by, + op_name, + input_shape, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {string} group_by Group By + * @param {*} [options] Override http request option. 
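+   * @example
+   * // Illustrative sketch only: 'Operation' is an assumed group_by value;
+   * // the accepted values are defined by the backend, not by this client.
+   * new DefaultApi()
+   *   .operationTableGet('resnet50_num_workers_0', 'worker0', '1', 'Operation')
+   *   .then((table) => console.log(table))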
+ * @throws {RequiredError} + * @memberof DefaultApi + */ + public operationTableGet( + run: string, + worker: string, + span: string, + group_by: string, + options?: any + ) { + return DefaultApiFp(this.configuration).operationTableGet( + run, + worker, + span, + group_by, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public overviewGet(run: string, worker: string, span: string, options?: any) { + return DefaultApiFp(this.configuration).overviewGet( + run, + worker, + span, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public runsGet(options?: any) { + return DefaultApiFp(this.configuration).runsGet(options)( + this.fetch, + this.basePath + ) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public spansGet(run: string, worker: string, options?: any) { + return DefaultApiFp(this.configuration).spansGet( + run, + worker, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public traceGet(run: string, worker: string, span: string, options?: any) { + return DefaultApiFp(this.configuration).traceGet( + run, + worker, + span, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {string} worker + * @param {string} span + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public treeGet(run: string, worker: string, span: string, options?: any) { + return DefaultApiFp(this.configuration).treeGet( + run, + worker, + span, + options + )(this.fetch, this.basePath) + } + + /** + * + * @param {string} run + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public viewsGet(run: string, options?: any) { + return DefaultApiFp(this.configuration).viewsGet(run, options)( + this.fetch, + this.basePath + ) + } + + /** + * + * @param {string} run + * @param {string} view + * @param {*} [options] Override http request option. + * @throws {RequiredError} + * @memberof DefaultApi + */ + public workersGet(run: string, view: string, options?: any) { + return DefaultApiFp(this.configuration).workersGet( + run, + view, + options + )(this.fetch, this.basePath) + } +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/api/generated/configuration.ts b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/configuration.ts new file mode 100644 index 0000000000000000000000000000000000000000..edec57eed84498fa3dcaa804ada9787b0202066c --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/configuration.ts @@ -0,0 +1,69 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. 
+ *--------------------------------------------------------------------------------------------*/ + +// tslint:disable +/** + * Pytorch profile API + * No description provided (generated by Swagger Codegen https://github.com/swagger-api/swagger-codegen) + * + * OpenAPI spec version: 1.0.0 + * + * + * NOTE: This file is auto generated by the swagger code generator program. + * https://github.com/swagger-api/swagger-codegen.git + * Do not edit the file manually. + */ + +export interface ConfigurationParameters { + apiKey?: string | ((name: string) => string) + username?: string + password?: string + accessToken?: string | ((name: string, scopes?: string[]) => string) + basePath?: string +} + +export class Configuration { + /** + * parameter for apiKey security + * @param name security name + * @memberof Configuration + */ + apiKey?: string | ((name: string) => string) + /** + * parameter for basic security + * + * @type {string} + * @memberof Configuration + */ + username?: string + /** + * parameter for basic security + * + * @type {string} + * @memberof Configuration + */ + password?: string + /** + * parameter for oauth2 security + * @param name security name + * @param scopes oauth2 scope + * @memberof Configuration + */ + accessToken?: string | ((name: string, scopes?: string[]) => string) + /** + * override base path + * + * @type {string} + * @memberof Configuration + */ + basePath?: string + + constructor(param: ConfigurationParameters = {}) { + this.apiKey = param.apiKey + this.username = param.username + this.password = param.password + this.accessToken = param.accessToken + this.basePath = param.basePath + } +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/api/generated/custom.d.ts b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/custom.d.ts new file mode 100644 index 0000000000000000000000000000000000000000..bfe6a59d9df208845d2fb5a43edb7a2f3d8721ae --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/custom.d.ts @@ -0,0 +1,6 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +declare module 'portable-fetch' +declare module 'url' diff --git a/tb_plugins/profiling/tb_plugin/fe/src/api/generated/index.ts b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/index.ts new file mode 100644 index 0000000000000000000000000000000000000000..1ab79fb65f34d7c33099bac7e54378c3f54fdb35 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/index.ts @@ -0,0 +1,19 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +// tslint:disable +/** + * Pytorch profile API + * No description provided (generated by Swagger Codegen https://github.com/swagger-api/swagger-codegen) + * + * OpenAPI spec version: 1.0.0 + * + * + * NOTE: This file is auto generated by the swagger code generator program. + * https://github.com/swagger-api/swagger-codegen.git + * Do not edit the file manually. 
+ */ + +export * from './api' +export * from './configuration' diff --git a/tb_plugins/profiling/tb_plugin/fe/src/api/index.ts b/tb_plugins/profiling/tb_plugin/fe/src/api/index.ts new file mode 100644 index 0000000000000000000000000000000000000000..f43336a583b81998422facba8787270d6cee7673 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/api/index.ts @@ -0,0 +1,8 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import * as api from './generated' + +export const defaultApi = new api.DefaultApi(undefined, undefined, fetch) +export * from './generated/api' diff --git a/tb_plugins/profiling/tb_plugin/fe/src/api/mock.ts b/tb_plugins/profiling/tb_plugin/fe/src/api/mock.ts new file mode 100644 index 0000000000000000000000000000000000000000..744c222a0266eed6359bb60fc0f6ba9601ba8edc --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/api/mock.ts @@ -0,0 +1,6716 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +export class MockAPI { + runsGet() { + return { + runs: ['resnet50_num_workers_0', 'resnet50_num_workers_4'], + loading: false + } + } + + viewsGet(run: string) { + return Promise.resolve([ + 'Overview', + 'Operator', + 'Kernel', + 'Trace', + 'Memory' + ]) + } + + spansGet(run: string, view: String) { + return Promise.resolve(['1', '2']) + } + + workersGet(run: string, view: String) { + return Promise.resolve(['worker0']) + } + + overviewGet(run: string, worker: string, span: string) { + return Promise.resolve({ + steps: { + columns: [ + { type: 'string', name: 'Step' }, + { type: 'number', name: 'Kernel' }, + { type: 'string', role: 'tooltip', p: { html: 'true' } }, + { type: 'number', name: 'Memcpy' }, + { type: 'string', role: 'tooltip', p: { html: 'true' } }, + { type: 'number', name: 'Memset' }, + { type: 'string', role: 'tooltip', p: { html: 'true' } }, + { type: 'number', name: 'Runtime' }, + { type: 'string', role: 'tooltip', p: { html: 'true' } }, + { type: 'number', name: 'DataLoader' }, + { type: 'string', role: 'tooltip', p: { html: 'true' } }, + { type: 'number', name: 'CPU Exec' }, + { type: 'string', role: 'tooltip', p: { html: 'true' } }, + { type: 'number', name: 'Other' }, + { type: 'string', role: 'tooltip', p: { html: 'true' } } + ], + rows: [ + [ + '5', + 98598, + '
<div class="visualization-tooltip" style="white-space: nowrap;">Step 5<br>Total: 187948us<br><b>Kernel: 98598us</b><br>Percentage: 52.46%</div>',
+            1941,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 5<br>Total: 187948us<br><b>Memcpy: 1941us</b><br>Percentage: 1.03%</div>',
+            90,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 5<br>Total: 187948us<br><b>Memset: 90us</b><br>Percentage: 0.05%</div>',
+            2796,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 5<br>Total: 187948us<br><b>Runtime: 2796us</b><br>Percentage: 1.49%</div>',
+            69317,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 5<br>Total: 187948us<br><b>DataLoader: 69317us</b><br>Percentage: 36.88%</div>',
+            14091,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 5<br>Total: 187948us<br><b>CPU Exec: 14091us</b><br>Percentage: 7.5%</div>',
+            1115,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 5<br>Total: 187948us<br><b>Other: 1115us</b><br>Percentage: 0.59%</div>'
+          ],
+          [
+            '6',
+            98570,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 6<br>Total: 175153us<br><b>Kernel: 98570us</b><br>Percentage: 56.28%</div>',
+            1947,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 6<br>Total: 175153us<br><b>Memcpy: 1947us</b><br>Percentage: 1.11%</div>',
+            89,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 6<br>Total: 175153us<br><b>Memset: 89us</b><br>Percentage: 0.05%</div>',
+            2762,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 6<br>Total: 175153us<br><b>Runtime: 2762us</b><br>Percentage: 1.58%</div>',
+            57669,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 6<br>Total: 175153us<br><b>DataLoader: 57669us</b><br>Percentage: 32.92%</div>',
+            12968,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 6<br>Total: 175153us<br><b>CPU Exec: 12968us</b><br>Percentage: 7.4%</div>',
+            1148,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 6<br>Total: 175153us<br><b>Other: 1148us</b><br>Percentage: 0.66%</div>'
+          ],
+          [
+            '7',
+            98596,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 7<br>Total: 179733us<br><b>Kernel: 98596us</b><br>Percentage: 54.86%</div>',
+            1931,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 7<br>Total: 179733us<br><b>Memcpy: 1931us</b><br>Percentage: 1.07%</div>',
+            91,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 7<br>Total: 179733us<br><b>Memset: 91us</b><br>Percentage: 0.05%</div>',
+            2877,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 7<br>Total: 179733us<br><b>Runtime: 2877us</b><br>Percentage: 1.6%</div>',
+            61257,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 7<br>Total: 179733us<br><b>DataLoader: 61257us</b><br>Percentage: 34.08%</div>',
+            13768,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 7<br>Total: 179733us<br><b>CPU Exec: 13768us</b><br>Percentage: 7.66%</div>',
+            1213,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 7<br>Total: 179733us<br><b>Other: 1213us</b><br>Percentage: 0.67%</div>'
+          ],
+          [
+            '8',
+            98623,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 8<br>Total: 174564us<br><b>Kernel: 98623us</b><br>Percentage: 56.5%</div>',
+            1938,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 8<br>Total: 174564us<br><b>Memcpy: 1938us</b><br>Percentage: 1.11%</div>',
+            89,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 8<br>Total: 174564us<br><b>Memset: 89us</b><br>Percentage: 0.05%</div>',
+            2841,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 8<br>Total: 174564us<br><b>Runtime: 2841us</b><br>Percentage: 1.63%</div>',
+            56453,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 8<br>Total: 174564us<br><b>DataLoader: 56453us</b><br>Percentage: 32.34%</div>',
+            13420,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 8<br>Total: 174564us<br><b>CPU Exec: 13420us</b><br>Percentage: 7.69%</div>',
+            1200,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 8<br>Total: 174564us<br><b>Other: 1200us</b><br>Percentage: 0.69%</div>'
+          ],
+          [
+            '9',
+            98504,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 9<br>Total: 182172us<br><b>Kernel: 98504us</b><br>Percentage: 54.07%</div>',
+            1937,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 9<br>Total: 182172us<br><b>Memcpy: 1937us</b><br>Percentage: 1.06%</div>',
+            87,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 9<br>Total: 182172us<br><b>Memset: 87us</b><br>Percentage: 0.05%</div>',
+            2788,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 9<br>Total: 182172us<br><b>Runtime: 2788us</b><br>Percentage: 1.53%</div>',
+            62690,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 9<br>Total: 182172us<br><b>DataLoader: 62690us</b><br>Percentage: 34.41%</div>',
+            15025,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 9<br>Total: 182172us<br><b>CPU Exec: 15025us</b><br>Percentage: 8.25%</div>',
+            1141,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 9<br>Total: 182172us<br><b>Other: 1141us</b><br>Percentage: 0.63%</div>'
+          ],
+          [
+            '10',
+            98641,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 10<br>Total: 165983us<br><b>Kernel: 98641us</b><br>Percentage: 59.43%</div>',
+            1798,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 10<br>Total: 165983us<br><b>Memcpy: 1798us</b><br>Percentage: 1.08%</div>',
+            88,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 10<br>Total: 165983us<br><b>Memset: 88us</b><br>Percentage: 0.05%</div>',
+            3381,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 10<br>Total: 165983us<br><b>Runtime: 3381us</b><br>Percentage: 2.04%</div>',
+            48185,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 10<br>Total: 165983us<br><b>DataLoader: 48185us</b><br>Percentage: 29.03%</div>',
+            12773,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 10<br>Total: 165983us<br><b>CPU Exec: 12773us</b><br>Percentage: 7.7%</div>',
+            1117,
+            '<div class="visualization-tooltip" style="white-space: nowrap;">Step 10<br>Total: 165983us<br><b>Other: 1117us</b><br>Percentage: 0.67%</div>'
+          ]
+        ]
+      },
+      performance: [
+        {
+          name: 'Average Step Time',
+          description: '',
+          value: 177592,
+          extra: 100,
+          children: [
+            { name: 'Kernel', description: '', value: 98589, extra: 55.51 },
+            { name: 'Memcpy', description: '', value: 1915, extra: 1.08 },
+            { name: 'Memset', description: '', value: 89, extra: 0.05 },
+            { name: 'Runtime', description: '', value: 2908, extra: 1.64 },
+            { name: 'DataLoader', description: '', value: 59262, extra: 33.37 },
+            { name: 'CPU Exec', description: '', value: 13674, extra: 7.7 },
+            { name: 'Other', description: '', value: 1156, extra: 0.65 }
+          ]
+        }
+      ],
+      recommendations:
+        '<ul><li>This run has high time cost on input data loading. 33.4% of the step time is in DataLoader. You could try to set num_workers on DataLoader\'s construction and enable multi-processes on data loading.</li><li>Kernels with 68% time are launched by Tensor Cores eligible operators. You could enable Automatic Mixed Precision to speedup by using FP16.</li></ul>
', + environments: [ + { title: 'Number of Worker(s)', value: '1' }, + { title: 'Device Type', value: 'GPU' } + ], + gpu_metrics: { + title: 'GPU Summary', + data: [ + { title: 'GPU 0:', value: '' }, + { title: 'Name', value: 'Tesla V100-DGXS-32GB' }, + { title: 'Memory', value: '31.74 GB' }, + { title: 'Compute Capability', value: '7.0' }, + { title: 'GPU Utilization', value: '55.51 %' }, + { title: 'Est. SM Efficiency', value: '54.68 %' }, + { title: 'Est. Achieved Occupancy', value: '49.13 %' }, + { title: 'Kernel Time using Tensor Cores', value: '0.0 %' } + ], + tooltip: + "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. The higher, the better. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. The higher, the better. This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nFor most cases such as memory bandwidth bounded kernels, the higher the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted average of all kernels' OCC_K using kernel's execution duration as weight. 
It shows fine-grained low-level GPU utilization.\n\nKernel using Tensor Cores:\nTotal GPU Time for Tensor Core kernels / Total GPU Time for all kernels.\n" + } + }) + } + + diffnodeGet( + run: string, + worker: string, + span: string, + exp_run: string, + exp_worker: string, + exp_span: string, + path?: string + ) { + return Promise.resolve({ + left: { + name: 'multiple nodes', + duration: 4246748, + device_duration: 376761, + total_duration: 3823182, + aggs: [ + { + name: 'aten::empty', + calls: 4214, + host_duration: 186312, + device_duration: 0, + self_host_duration: 186312, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 846, + host_duration: 31902, + device_duration: 736, + self_host_duration: 17460, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 520, + host_duration: 62713, + device_duration: 0, + self_host_duration: 32640, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 2696, + host_duration: 1711486, + device_duration: 8796, + self_host_duration: 37162, + self_device_duration: 0 + }, + { + name: 'detach', + calls: 256, + host_duration: 4379, + device_duration: 0, + self_host_duration: 4379, + self_device_duration: 0 + }, + { + name: 'aten::detach', + calls: 256, + host_duration: 10596, + device_duration: 0, + self_host_duration: 6217, + self_device_duration: 0 + }, + { + name: 'aten::as_strided', + calls: 914, + host_duration: 8470, + device_duration: 0, + self_host_duration: 8470, + self_device_duration: 0 + }, + { + name: 'aten::unsqueeze', + calls: 384, + host_duration: 19150, + device_duration: 0, + self_host_duration: 16142, + self_device_duration: 0 + }, + { + name: 'aten::empty_strided', + calls: 1158, + host_duration: 50043, + device_duration: 0, + self_host_duration: 50043, + self_device_duration: 0 + }, + { + name: 'aten::copy_', + calls: 1412, + host_duration: 1518205, + device_duration: 8796, + self_host_duration: 1509009, + self_device_duration: 8796 + }, + { + name: 'aten::_to_copy', + calls: 1284, + host_duration: 1674324, + device_duration: 8796, + self_host_duration: 104788, + self_device_duration: 0 + }, + { + name: 'aten::upsample_bilinear2d', + calls: 128, + host_duration: 460479, + device_duration: 0, + self_host_duration: 421547, + self_device_duration: 0 + }, + { + name: 'aten::squeeze', + calls: 128, + host_duration: 9401, + device_duration: 0, + self_host_duration: 8211, + self_device_duration: 0 + }, + { + name: 'aten::round', + calls: 128, + host_duration: 31311, + device_duration: 0, + self_host_duration: 31311, + self_device_duration: 0 + }, + { + name: 'aten::slice', + calls: 260, + host_duration: 17762, + device_duration: 0, + self_host_duration: 15082, + self_device_duration: 0 + }, + { + name: 'detach_', + calls: 512, + host_duration: 4194, + device_duration: 0, + self_host_duration: 4194, + self_device_duration: 0 + }, + { + name: 'aten::detach_', + calls: 512, + host_duration: 14514, + device_duration: 0, + self_host_duration: 10320, + self_device_duration: 0 + }, + { + name: 'aten::result_type', + calls: 640, + host_duration: 1734, + device_duration: 0, + self_host_duration: 1734, + self_device_duration: 0 + }, + { + name: 'aten::pow', + calls: 640, + host_duration: 86249, + device_duration: 0, + self_host_duration: 78373, + self_device_duration: 0 + }, + { + name: 'aten::sub', + calls: 640, + host_duration: 183533, + device_duration: 0, + self_host_duration: 75637, + self_device_duration: 0 + }, + { + name: 'aten::gt', + calls: 640, + host_duration: 71284, + device_duration: 0, + 
self_host_duration: 49575, + self_device_duration: 0 + }, + { + name: 'aten::_local_scalar_dense', + calls: 768, + host_duration: 4948, + device_duration: 0, + self_host_duration: 4948, + self_device_duration: 0 + }, + { + name: 'aten::item', + calls: 768, + host_duration: 20922, + device_duration: 0, + self_host_duration: 15974, + self_device_duration: 0 + }, + { + name: 'aten::is_nonzero', + calls: 640, + host_duration: 27934, + device_duration: 0, + self_host_duration: 10747, + self_device_duration: 0 + }, + { + name: 'aten::div', + calls: 130, + host_duration: 168214, + device_duration: 75, + self_host_duration: 146203, + self_device_duration: 75 + }, + { + name: 'aten::resize_', + calls: 6, + host_duration: 248, + device_duration: 0, + self_host_duration: 248, + self_device_duration: 0 + }, + { + name: 'aten::narrow', + calls: 4, + host_duration: 280, + device_duration: 0, + self_host_duration: 99, + self_device_duration: 0 + }, + { + name: 'aten::_cat', + calls: 4, + host_duration: 92993, + device_duration: 0, + self_host_duration: 92405, + self_device_duration: 0 + }, + { + name: 'aten::cat', + calls: 4, + host_duration: 93282, + device_duration: 0, + self_host_duration: 289, + self_device_duration: 0 + }, + { + name: 'aten::stack', + calls: 4, + host_duration: 124757, + device_duration: 0, + self_host_duration: 22050, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution', + calls: 106, + host_duration: 44043, + device_duration: 71832, + self_host_duration: 35027, + self_device_duration: 71832 + }, + { + name: 'aten::_convolution', + calls: 106, + host_duration: 51312, + device_duration: 71832, + self_host_duration: 7269, + self_device_duration: 0 + }, + { + name: 'aten::convolution', + calls: 106, + host_duration: 55287, + device_duration: 71832, + self_host_duration: 3975, + self_device_duration: 0 + }, + { + name: 'aten::conv2d', + calls: 106, + host_duration: 59323, + device_duration: 71832, + self_host_duration: 4036, + self_device_duration: 0 + }, + { + name: 'aten::add', + calls: 138, + host_duration: 17461, + device_duration: 10540, + self_host_duration: 15188, + self_device_duration: 10540 + }, + { + name: 'aten::empty_like', + calls: 108, + host_duration: 11504, + device_duration: 0, + self_host_duration: 4865, + self_device_duration: 0 + }, + { + name: 'aten::view', + calls: 214, + host_duration: 3589, + device_duration: 0, + self_host_duration: 3589, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm', + calls: 106, + host_duration: 71328, + device_duration: 25802, + self_host_duration: 40944, + self_device_duration: 25802 + }, + { + name: 'aten::_batch_norm_impl_index', + calls: 106, + host_duration: 76354, + device_duration: 25802, + self_host_duration: 5026, + self_device_duration: 0 + }, + { + name: 'aten::batch_norm', + calls: 106, + host_duration: 79832, + device_duration: 25802, + self_host_duration: 3478, + self_device_duration: 0 + }, + { + name: 'aten::clamp_min', + calls: 98, + host_duration: 5417, + device_duration: 12000, + self_host_duration: 3885, + self_device_duration: 12000 + }, + { + name: 'aten::clamp_min_', + calls: 98, + host_duration: 8537, + device_duration: 12000, + self_host_duration: 3120, + self_device_duration: 0 + }, + { + name: 'aten::relu_', + calls: 98, + host_duration: 16708, + device_duration: 12000, + self_host_duration: 8171, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices', + calls: 2, + host_duration: 442, + device_duration: 940, + self_host_duration: 405, + 
self_device_duration: 940 + }, + { + name: 'aten::max_pool2d', + calls: 2, + host_duration: 542, + device_duration: 940, + self_host_duration: 100, + self_device_duration: 0 + }, + { + name: 'aten::add_', + calls: 998, + host_duration: 72931, + device_duration: 13090, + self_host_duration: 57558, + self_device_duration: 13090 + }, + { + name: 'aten::mean', + calls: 2, + host_duration: 376, + device_duration: 133, + self_host_duration: 339, + self_device_duration: 133 + }, + { + name: 'aten::adaptive_avg_pool2d', + calls: 2, + host_duration: 465, + device_duration: 133, + self_host_duration: 89, + self_device_duration: 0 + }, + { + name: 'aten::_reshape_alias', + calls: 4, + host_duration: 170, + device_duration: 0, + self_host_duration: 170, + self_device_duration: 0 + }, + { + name: 'aten::flatten', + calls: 2, + host_duration: 207, + device_duration: 0, + self_host_duration: 103, + self_device_duration: 0 + }, + { + name: 'aten::transpose', + calls: 10, + host_duration: 587, + device_duration: 0, + self_host_duration: 465, + self_device_duration: 0 + }, + { + name: 'aten::t', + calls: 10, + host_duration: 1068, + device_duration: 0, + self_host_duration: 481, + self_device_duration: 0 + }, + { + name: 'aten::expand', + calls: 4, + host_duration: 277, + device_duration: 0, + self_host_duration: 227, + self_device_duration: 0 + }, + { + name: 'aten::addmm', + calls: 2, + host_duration: 809, + device_duration: 84, + self_host_duration: 604, + self_device_duration: 84 + }, + { + name: 'aten::linear', + calls: 2, + host_duration: 1185, + device_duration: 84, + self_host_duration: 137, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax', + calls: 2, + host_duration: 308, + device_duration: 14, + self_host_duration: 271, + self_device_duration: 14 + }, + { + name: 'aten::log_softmax', + calls: 2, + host_duration: 472, + device_duration: 14, + self_host_duration: 153, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_forward', + calls: 2, + host_duration: 522, + device_duration: 8, + self_host_duration: 476, + self_device_duration: 8 + }, + { + name: 'aten::nll_loss', + calls: 2, + host_duration: 590, + device_duration: 8, + self_host_duration: 68, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_nd', + calls: 2, + host_duration: 641, + device_duration: 8, + self_host_duration: 51, + self_device_duration: 0 + }, + { + name: 'aten::cross_entropy_loss', + calls: 2, + host_duration: 1234, + device_duration: 22, + self_host_duration: 121, + self_device_duration: 0 + }, + { + name: 'aten::fill_', + calls: 328, + host_duration: 14541, + device_duration: 738, + self_host_duration: 10083, + self_device_duration: 738 + }, + { + name: 'aten::ones_like', + calls: 2, + host_duration: 516, + device_duration: 2, + self_host_duration: 142, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_backward', + calls: 2, + host_duration: 573, + device_duration: 8, + self_host_duration: 310, + self_device_duration: 6 + }, + { + name: 'NllLossBackward0', + calls: 2, + host_duration: 774, + device_duration: 8, + self_host_duration: 201, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: NllLossBackward0', + calls: 2, + host_duration: 1025, + device_duration: 8, + self_host_duration: 251, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax_backward_data', + calls: 2, + host_duration: 236, + device_duration: 18, + self_host_duration: 196, + self_device_duration: 18 + }, + { + name: 'LogSoftmaxBackward0', + calls: 2, + host_duration: 385, + 
device_duration: 18, + self_host_duration: 149, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: LogSoftmaxBackward0', + calls: 2, + host_duration: 632, + device_duration: 18, + self_host_duration: 247, + self_device_duration: 0 + }, + { + name: 'aten::mm', + calls: 4, + host_duration: 668, + device_duration: 140, + self_host_duration: 547, + self_device_duration: 140 + }, + { + name: 'AddmmBackward0', + calls: 2, + host_duration: 1698, + device_duration: 140, + self_host_duration: 417, + self_device_duration: 0 + }, + { + name: 'aten::sum', + calls: 2, + host_duration: 370, + device_duration: 15, + self_host_duration: 328, + self_device_duration: 15 + }, + { + name: 'autograd::engine::evaluate_function: AddmmBackward0', + calls: 2, + host_duration: 2710, + device_duration: 155, + self_host_duration: 567, + self_device_duration: 0 + }, + { + name: 'torch::autograd::AccumulateGrad', + calls: 322, + host_duration: 41184, + device_duration: 997, + self_host_duration: 16159, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: torch::autograd::AccumulateGrad', + calls: 322, + host_duration: 70946, + device_duration: 997, + self_host_duration: 29762, + self_device_duration: 0 + }, + { + name: 'TBackward0', + calls: 2, + host_duration: 280, + device_duration: 0, + self_host_duration: 64, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: TBackward0', + calls: 2, + host_duration: 428, + device_duration: 0, + self_host_duration: 148, + self_device_duration: 0 + }, + { + name: 'aten::reshape', + calls: 2, + host_duration: 170, + device_duration: 0, + self_host_duration: 104, + self_device_duration: 0 + }, + { + name: 'ReshapeAliasBackward0', + calls: 2, + host_duration: 264, + device_duration: 0, + self_host_duration: 94, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: ReshapeAliasBackward0', + calls: 2, + host_duration: 402, + device_duration: 0, + self_host_duration: 138, + self_device_duration: 0 + }, + { + name: 'MeanBackward1', + calls: 2, + host_duration: 1036, + device_duration: 75, + self_host_duration: 231, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: MeanBackward1', + calls: 2, + host_duration: 1254, + device_duration: 75, + self_host_duration: 218, + self_device_duration: 0 + }, + { + name: 'aten::threshold_backward', + calls: 98, + host_duration: 13838, + device_duration: 17984, + self_host_duration: 12131, + self_device_duration: 17984 + }, + { + name: 'ReluBackward0', + calls: 98, + host_duration: 21183, + device_duration: 17984, + self_host_duration: 7345, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: ReluBackward0', + calls: 98, + host_duration: 33492, + device_duration: 17984, + self_host_duration: 12309, + self_device_duration: 0 + }, + { + name: 'AddBackward0', + calls: 32, + host_duration: 251, + device_duration: 0, + self_host_duration: 251, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: AddBackward0', + calls: 32, + host_duration: 2579, + device_duration: 0, + self_host_duration: 2328, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm_backward', + calls: 106, + host_duration: 62175, + device_duration: 44433, + self_host_duration: 36053, + self_device_duration: 44433 + }, + { + name: 'CudnnBatchNormBackward0', + calls: 106, + host_duration: 69160, + device_duration: 44433, + self_host_duration: 6985, + self_device_duration: 0 + }, + { 
+ name: + 'autograd::engine::evaluate_function: CudnnBatchNormBackward0', + calls: 106, + host_duration: 88613, + device_duration: 44433, + self_host_duration: 19453, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution_backward_input', + calls: 104, + host_duration: 40820, + device_duration: 76620, + self_host_duration: 30768, + self_device_duration: 76620 + }, + { + name: 'aten::cudnn_convolution_backward_weight', + calls: 106, + host_duration: 44875, + device_duration: 90108, + self_host_duration: 27458, + self_device_duration: 90108 + }, + { + name: 'aten::cudnn_convolution_backward', + calls: 106, + host_duration: 101020, + device_duration: 166728, + self_host_duration: 15325, + self_device_duration: 0 + }, + { + name: 'CudnnConvolutionBackward0', + calls: 106, + host_duration: 107964, + device_duration: 166728, + self_host_duration: 6944, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnConvolutionBackward0', + calls: 106, + host_duration: 129129, + device_duration: 177161, + self_host_duration: 16746, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices_backward', + calls: 2, + host_duration: 483, + device_duration: 3048, + self_host_duration: 257, + self_device_duration: 2588 + }, + { + name: 'MaxPool2DWithIndicesBackward0', + calls: 2, + host_duration: 599, + device_duration: 3048, + self_host_duration: 116, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: MaxPool2DWithIndicesBackward0', + calls: 2, + host_duration: 836, + device_duration: 3048, + self_host_duration: 237, + self_device_duration: 0 + }, + { + name: 'aten::mul_', + calls: 322, + host_duration: 23818, + device_duration: 797, + self_host_duration: 19073, + self_device_duration: 797 + } + ] + }, + right: { + name: 'multiple nodes', + duration: 468427, + device_duration: 374211, + total_duration: 644686, + aggs: [ + { + name: 'aten::empty', + calls: 4214, + host_duration: 31594, + device_duration: 0, + self_host_duration: 31594, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 846, + host_duration: 6010, + device_duration: 864, + self_host_duration: 1910, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 520, + host_duration: 10338, + device_duration: 0, + self_host_duration: 2951, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 2696, + host_duration: 47031, + device_duration: 8684, + self_host_duration: 4258, + self_device_duration: 0 + }, + { + name: 'detach', + calls: 256, + host_duration: 701, + device_duration: 0, + self_host_duration: 698, + self_device_duration: 0 + }, + { + name: 'aten::detach', + calls: 256, + host_duration: 1374, + device_duration: 0, + self_host_duration: 676, + self_device_duration: 0 + }, + { + name: 'aten::as_strided', + calls: 914, + host_duration: 1013, + device_duration: 0, + self_host_duration: 1013, + self_device_duration: 0 + }, + { + name: 'aten::unsqueeze', + calls: 384, + host_duration: 2074, + device_duration: 0, + self_host_duration: 1723, + self_device_duration: 0 + }, + { + name: 'aten::empty_strided', + calls: 1158, + host_duration: 6859, + device_duration: 0, + self_host_duration: 6859, + self_device_duration: 0 + }, + { + name: 'aten::copy_', + calls: 1412, + host_duration: 25248, + device_duration: 8684, + self_host_duration: 16166, + self_device_duration: 8684 + }, + { + name: 'aten::_to_copy', + calls: 1284, + host_duration: 42773, + device_duration: 8684, + self_host_duration: 10227, + self_device_duration: 0 + 
}, + { + name: 'aten::upsample_bilinear2d', + calls: 128, + host_duration: 51788, + device_duration: 0, + self_host_duration: 46788, + self_device_duration: 0 + }, + { + name: 'aten::squeeze', + calls: 128, + host_duration: 1035, + device_duration: 0, + self_host_duration: 895, + self_device_duration: 0 + }, + { + name: 'aten::round', + calls: 128, + host_duration: 11074, + device_duration: 0, + self_host_duration: 11074, + self_device_duration: 0 + }, + { + name: 'aten::slice', + calls: 260, + host_duration: 1892, + device_duration: 0, + self_host_duration: 1600, + self_device_duration: 0 + }, + { + name: 'detach_', + calls: 512, + host_duration: 278, + device_duration: 0, + self_host_duration: 244, + self_device_duration: 0 + }, + { + name: 'aten::detach_', + calls: 512, + host_duration: 1341, + device_duration: 0, + self_host_duration: 1097, + self_device_duration: 0 + }, + { + name: 'aten::result_type', + calls: 640, + host_duration: 317, + device_duration: 0, + self_host_duration: 317, + self_device_duration: 0 + }, + { + name: 'aten::pow', + calls: 640, + host_duration: 8857, + device_duration: 0, + self_host_duration: 7959, + self_device_duration: 0 + }, + { + name: 'aten::sub', + calls: 640, + host_duration: 17840, + device_duration: 0, + self_host_duration: 7688, + self_device_duration: 0 + }, + { + name: 'aten::gt', + calls: 640, + host_duration: 6903, + device_duration: 0, + self_host_duration: 4901, + self_device_duration: 0 + }, + { + name: 'aten::_local_scalar_dense', + calls: 768, + host_duration: 395, + device_duration: 0, + self_host_duration: 395, + self_device_duration: 0 + }, + { + name: 'aten::item', + calls: 768, + host_duration: 2532, + device_duration: 0, + self_host_duration: 2130, + self_device_duration: 0 + }, + { + name: 'aten::is_nonzero', + calls: 640, + host_duration: 3601, + device_duration: 0, + self_host_duration: 1427, + self_device_duration: 0 + }, + { + name: 'aten::div', + calls: 130, + host_duration: 11707, + device_duration: 75, + self_host_duration: 9531, + self_device_duration: 75 + }, + { + name: 'aten::resize_', + calls: 6, + host_duration: 79, + device_duration: 0, + self_host_duration: 79, + self_device_duration: 0 + }, + { + name: 'aten::narrow', + calls: 4, + host_duration: 37, + device_duration: 0, + self_host_duration: 16, + self_device_duration: 0 + }, + { + name: 'aten::_cat', + calls: 4, + host_duration: 9241, + device_duration: 0, + self_host_duration: 9113, + self_device_duration: 0 + }, + { + name: 'aten::cat', + calls: 4, + host_duration: 9286, + device_duration: 0, + self_host_duration: 45, + self_device_duration: 0 + }, + { + name: 'aten::stack', + calls: 4, + host_duration: 16195, + device_duration: 0, + self_host_duration: 6105, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution', + calls: 106, + host_duration: 17357, + device_duration: 71414, + self_host_duration: 13601, + self_device_duration: 71414 + }, + { + name: 'aten::_convolution', + calls: 106, + host_duration: 18514, + device_duration: 71414, + self_host_duration: 1157, + self_device_duration: 0 + }, + { + name: 'aten::convolution', + calls: 106, + host_duration: 19185, + device_duration: 71414, + self_host_duration: 671, + self_device_duration: 0 + }, + { + name: 'aten::conv2d', + calls: 106, + host_duration: 19750, + device_duration: 71414, + self_host_duration: 565, + self_device_duration: 0 + }, + { + name: 'aten::add', + calls: 138, + host_duration: 4973, + device_duration: 10567, + self_host_duration: 3157, + self_device_duration: 10567 + }, + { + 
name: 'aten::empty_like', + calls: 108, + host_duration: 1924, + device_duration: 0, + self_host_duration: 598, + self_device_duration: 0 + }, + { + name: 'aten::view', + calls: 214, + host_duration: 596, + device_duration: 0, + self_host_duration: 596, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm', + calls: 106, + host_duration: 11083, + device_duration: 25737, + self_host_duration: 5031, + self_device_duration: 25737 + }, + { + name: 'aten::_batch_norm_impl_index', + calls: 106, + host_duration: 11856, + device_duration: 25737, + self_host_duration: 773, + self_device_duration: 0 + }, + { + name: 'aten::batch_norm', + calls: 106, + host_duration: 12386, + device_duration: 25737, + self_host_duration: 530, + self_device_duration: 0 + }, + { + name: 'aten::clamp_min', + calls: 98, + host_duration: 2189, + device_duration: 12010, + self_host_duration: 1030, + self_device_duration: 12010 + }, + { + name: 'aten::clamp_min_', + calls: 98, + host_duration: 2614, + device_duration: 12010, + self_host_duration: 425, + self_device_duration: 0 + }, + { + name: 'aten::relu_', + calls: 98, + host_duration: 3880, + device_duration: 12010, + self_host_duration: 1266, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices', + calls: 2, + host_duration: 112, + device_duration: 938, + self_host_duration: 82, + self_device_duration: 938 + }, + { + name: 'aten::max_pool2d', + calls: 2, + host_duration: 127, + device_duration: 938, + self_host_duration: 15, + self_device_duration: 0 + }, + { + name: 'aten::add_', + calls: 998, + host_duration: 21459, + device_duration: 13178, + self_host_duration: 11041, + self_device_duration: 13178 + }, + { + name: 'aten::mean', + calls: 2, + host_duration: 104, + device_duration: 126, + self_host_duration: 76, + self_device_duration: 126 + }, + { + name: 'aten::adaptive_avg_pool2d', + calls: 2, + host_duration: 117, + device_duration: 126, + self_host_duration: 13, + self_device_duration: 0 + }, + { + name: 'aten::_reshape_alias', + calls: 4, + host_duration: 26, + device_duration: 0, + self_host_duration: 26, + self_device_duration: 0 + }, + { + name: 'aten::flatten', + calls: 2, + host_duration: 31, + device_duration: 0, + self_host_duration: 15, + self_device_duration: 0 + }, + { + name: 'aten::transpose', + calls: 10, + host_duration: 85, + device_duration: 0, + self_host_duration: 68, + self_device_duration: 0 + }, + { + name: 'aten::t', + calls: 10, + host_duration: 145, + device_duration: 0, + self_host_duration: 60, + self_device_duration: 0 + }, + { + name: 'aten::expand', + calls: 4, + host_duration: 30, + device_duration: 0, + self_host_duration: 25, + self_device_duration: 0 + }, + { + name: 'aten::addmm', + calls: 2, + host_duration: 334, + device_duration: 84, + self_host_duration: 234, + self_device_duration: 84 + }, + { + name: 'aten::linear', + calls: 2, + host_duration: 386, + device_duration: 84, + self_host_duration: 19, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax', + calls: 2, + host_duration: 83, + device_duration: 14, + self_host_duration: 55, + self_device_duration: 14 + }, + { + name: 'aten::log_softmax', + calls: 2, + host_duration: 106, + device_duration: 14, + self_host_duration: 20, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_forward', + calls: 2, + host_duration: 96, + device_duration: 8, + self_host_duration: 68, + self_device_duration: 8 + }, + { + name: 'aten::nll_loss', + calls: 2, + host_duration: 105, + device_duration: 8, + self_host_duration: 9, + 
self_device_duration: 0 + }, + { + name: 'aten::nll_loss_nd', + calls: 2, + host_duration: 113, + device_duration: 8, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::cross_entropy_loss', + calls: 2, + host_duration: 243, + device_duration: 22, + self_host_duration: 24, + self_device_duration: 0 + }, + { + name: 'aten::fill_', + calls: 328, + host_duration: 4140, + device_duration: 866, + self_host_duration: 1851, + self_device_duration: 866 + }, + { + name: 'aten::ones_like', + calls: 2, + host_duration: 104, + device_duration: 2, + self_host_duration: 14, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_backward', + calls: 2, + host_duration: 192, + device_duration: 9, + self_host_duration: 84, + self_device_duration: 6 + }, + { + name: 'NllLossBackward0', + calls: 2, + host_duration: 297, + device_duration: 9, + self_host_duration: 105, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: NllLossBackward0', + calls: 2, + host_duration: 352, + device_duration: 9, + self_host_duration: 55, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax_backward_data', + calls: 2, + host_duration: 71, + device_duration: 18, + self_host_duration: 43, + self_device_duration: 18 + }, + { + name: 'LogSoftmaxBackward0', + calls: 2, + host_duration: 91, + device_duration: 18, + self_host_duration: 20, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: LogSoftmaxBackward0', + calls: 2, + host_duration: 126, + device_duration: 18, + self_host_duration: 35, + self_device_duration: 0 + }, + { + name: 'aten::mm', + calls: 4, + host_duration: 283, + device_duration: 134, + self_host_duration: 186, + self_device_duration: 134 + }, + { + name: 'AddmmBackward0', + calls: 2, + host_duration: 418, + device_duration: 134, + self_host_duration: 47, + self_device_duration: 0 + }, + { + name: 'aten::sum', + calls: 2, + host_duration: 92, + device_duration: 14, + self_host_duration: 62, + self_device_duration: 14 + }, + { + name: 'autograd::engine::evaluate_function: AddmmBackward0', + calls: 2, + host_duration: 594, + device_duration: 148, + self_host_duration: 75, + self_device_duration: 0 + }, + { + name: 'torch::autograd::AccumulateGrad', + calls: 322, + host_duration: 10317, + device_duration: 1069, + self_host_duration: 2127, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: torch::autograd::AccumulateGrad', + calls: 322, + host_duration: 15128, + device_duration: 1069, + self_host_duration: 4811, + self_device_duration: 0 + }, + { + name: 'TBackward0', + calls: 2, + host_duration: 30, + device_duration: 0, + self_host_duration: 6, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: TBackward0', + calls: 2, + host_duration: 45, + device_duration: 0, + self_host_duration: 15, + self_device_duration: 0 + }, + { + name: 'aten::reshape', + calls: 2, + host_duration: 20, + device_duration: 0, + self_host_duration: 10, + self_device_duration: 0 + }, + { + name: 'ReshapeAliasBackward0', + calls: 2, + host_duration: 31, + device_duration: 0, + self_host_duration: 11, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: ReshapeAliasBackward0', + calls: 2, + host_duration: 48, + device_duration: 0, + self_host_duration: 17, + self_device_duration: 0 + }, + { + name: 'MeanBackward1', + calls: 2, + host_duration: 172, + device_duration: 75, + self_host_duration: 18, + self_device_duration: 0 + }, + { + name: 
'autograd::engine::evaluate_function: MeanBackward1', + calls: 2, + host_duration: 201, + device_duration: 75, + self_host_duration: 29, + self_device_duration: 0 + }, + { + name: 'aten::threshold_backward', + calls: 98, + host_duration: 3652, + device_duration: 18018, + self_host_duration: 2361, + self_device_duration: 18018 + }, + { + name: 'ReluBackward0', + calls: 98, + host_duration: 4567, + device_duration: 18018, + self_host_duration: 915, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: ReluBackward0', + calls: 98, + host_duration: 6457, + device_duration: 18018, + self_host_duration: 1890, + self_device_duration: 0 + }, + { + name: 'AddBackward0', + calls: 32, + host_duration: 26, + device_duration: 0, + self_host_duration: 26, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: AddBackward0', + calls: 32, + host_duration: 261, + device_duration: 0, + self_host_duration: 235, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm_backward', + calls: 106, + host_duration: 9943, + device_duration: 44401, + self_host_duration: 4355, + self_device_duration: 44401 + }, + { + name: 'CudnnBatchNormBackward0', + calls: 106, + host_duration: 11132, + device_duration: 44401, + self_host_duration: 1189, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnBatchNormBackward0', + calls: 106, + host_duration: 14696, + device_duration: 44401, + self_host_duration: 3564, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution_backward_input', + calls: 104, + host_duration: 18813, + device_duration: 75568, + self_host_duration: 13997, + self_device_duration: 75568 + }, + { + name: 'aten::cudnn_convolution_backward_weight', + calls: 106, + host_duration: 18792, + device_duration: 88992, + self_host_duration: 11101, + self_device_duration: 88992 + }, + { + name: 'aten::cudnn_convolution_backward', + calls: 106, + host_duration: 40064, + device_duration: 164560, + self_host_duration: 2459, + self_device_duration: 0 + }, + { + name: 'CudnnConvolutionBackward0', + calls: 106, + host_duration: 41205, + device_duration: 164560, + self_host_duration: 1141, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnConvolutionBackward0', + calls: 106, + host_duration: 45209, + device_duration: 175014, + self_host_duration: 2826, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices_backward', + calls: 2, + host_duration: 145, + device_duration: 3016, + self_host_duration: 61, + self_device_duration: 2556 + }, + { + name: 'MaxPool2DWithIndicesBackward0', + calls: 2, + host_duration: 165, + device_duration: 3016, + self_host_duration: 20, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: MaxPool2DWithIndicesBackward0', + calls: 2, + host_duration: 209, + device_duration: 3016, + self_host_duration: 44, + self_device_duration: 0 + }, + { + name: 'aten::mul_', + calls: 322, + host_duration: 6835, + device_duration: 803, + self_host_duration: 3630, + self_device_duration: 803 + } + ] + }, + path: '0', + children: [ + { + left: { + name: 'multiple nodes', + duration: 168, + device_duration: 0, + total_duration: 168, + aggs: [ + { + name: 'aten::empty', + calls: 2, + host_duration: 100, + device_duration: 0, + self_host_duration: 100, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 4, + device_duration: 0, + self_host_duration: 4, + self_device_duration: 0 + }, + { + name: 
'aten::zeros', + calls: 1, + host_duration: 119, + device_duration: 0, + self_host_duration: 64, + self_device_duration: 0 + } + ] + }, + right: { + name: 'multiple nodes', + duration: 24, + device_duration: 0, + total_duration: 24, + aggs: [ + { + name: 'aten::empty', + calls: 2, + host_duration: 17, + device_duration: 0, + self_host_duration: 17, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 1, + device_duration: 0, + self_host_duration: 1, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 15, + device_duration: 0, + self_host_duration: 6, + self_device_duration: 0 + } + ] + }, + path: '0-0' + }, + { + left: { + name: 'enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__', + duration: 1766103, + device_duration: 0, + total_duration: 1766103, + aggs: [ + { + name: 'aten::empty', + calls: 1413, + host_duration: 62288, + device_duration: 0, + self_host_duration: 62288, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 257, + host_duration: 959, + device_duration: 0, + self_host_duration: 959, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 257, + host_duration: 35273, + device_duration: 0, + self_host_duration: 16154, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 1344, + host_duration: 877101, + device_duration: 0, + self_host_duration: 18482, + self_device_duration: 0 + }, + { + name: 'detach', + calls: 128, + host_duration: 2191, + device_duration: 0, + self_host_duration: 2191, + self_device_duration: 0 + }, + { + name: 'aten::detach', + calls: 128, + host_duration: 5301, + device_duration: 0, + self_host_duration: 3110, + self_device_duration: 0 + }, + { + name: 'aten::as_strided', + calls: 450, + host_duration: 4175, + device_duration: 0, + self_host_duration: 4175, + self_device_duration: 0 + }, + { + name: 'aten::unsqueeze', + calls: 192, + host_duration: 9560, + device_duration: 0, + self_host_duration: 8045, + self_device_duration: 0 + }, + { + name: 'aten::empty_strided', + calls: 576, + host_duration: 24689, + device_duration: 0, + self_host_duration: 24689, + self_device_duration: 0 + }, + { + name: 'aten::copy_', + calls: 704, + host_duration: 780214, + device_duration: 0, + self_host_duration: 780214, + self_device_duration: 0 + }, + { + name: 'aten::_to_copy', + calls: 640, + host_duration: 858619, + device_duration: 0, + self_host_duration: 53009, + self_device_duration: 0 + }, + { + name: 'aten::upsample_bilinear2d', + calls: 64, + host_duration: 224031, + device_duration: 0, + self_host_duration: 204660, + self_device_duration: 0 + }, + { + name: 'aten::squeeze', + calls: 64, + host_duration: 4719, + device_duration: 0, + self_host_duration: 4119, + self_device_duration: 0 + }, + { + name: 'aten::round', + calls: 64, + host_duration: 16028, + device_duration: 0, + self_host_duration: 16028, + self_device_duration: 0 + }, + { + name: 'aten::slice', + calls: 130, + host_duration: 8918, + device_duration: 0, + self_host_duration: 7569, + self_device_duration: 0 + }, + { + name: 'detach_', + calls: 256, + host_duration: 2092, + device_duration: 0, + self_host_duration: 2092, + self_device_duration: 0 + }, + { + name: 'aten::detach_', + calls: 256, + host_duration: 7228, + device_duration: 0, + self_host_duration: 5136, + self_device_duration: 0 + }, + { + name: 'aten::result_type', + calls: 320, + host_duration: 884, + device_duration: 0, + self_host_duration: 884, + self_device_duration: 0 + }, + { + name: 'aten::pow', + calls: 320, + 
host_duration: 43030, + device_duration: 0, + self_host_duration: 39068, + self_device_duration: 0 + }, + { + name: 'aten::sub', + calls: 320, + host_duration: 91440, + device_duration: 0, + self_host_duration: 37676, + self_device_duration: 0 + }, + { + name: 'aten::gt', + calls: 320, + host_duration: 35514, + device_duration: 0, + self_host_duration: 24706, + self_device_duration: 0 + }, + { + name: 'aten::_local_scalar_dense', + calls: 384, + host_duration: 2467, + device_duration: 0, + self_host_duration: 2467, + self_device_duration: 0 + }, + { + name: 'aten::item', + calls: 384, + host_duration: 10375, + device_duration: 0, + self_host_duration: 7908, + self_device_duration: 0 + }, + { + name: 'aten::is_nonzero', + calls: 320, + host_duration: 13905, + device_duration: 0, + self_host_duration: 5383, + self_device_duration: 0 + }, + { + name: 'aten::div', + calls: 64, + host_duration: 87841, + device_duration: 0, + self_host_duration: 76794, + self_device_duration: 0 + }, + { + name: 'aten::resize_', + calls: 2, + host_duration: 117, + device_duration: 0, + self_host_duration: 117, + self_device_duration: 0 + }, + { + name: 'aten::narrow', + calls: 2, + host_duration: 142, + device_duration: 0, + self_host_duration: 51, + self_device_duration: 0 + }, + { + name: 'aten::_cat', + calls: 2, + host_duration: 51526, + device_duration: 0, + self_host_duration: 51229, + self_device_duration: 0 + }, + { + name: 'aten::cat', + calls: 2, + host_duration: 51674, + device_duration: 0, + self_host_duration: 148, + self_device_duration: 0 + }, + { + name: 'aten::stack', + calls: 2, + host_duration: 75677, + device_duration: 0, + self_host_duration: 19330, + self_device_duration: 0 + } + ] + }, + right: { + name: 'enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__', + duration: 146745, + device_duration: 0, + total_duration: 146745, + aggs: [ + { + name: 'aten::empty', + calls: 1413, + host_duration: 12399, + device_duration: 0, + self_host_duration: 12399, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 257, + host_duration: 98, + device_duration: 0, + self_host_duration: 98, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 257, + host_duration: 7665, + device_duration: 0, + self_host_duration: 1689, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 1344, + host_duration: 21137, + device_duration: 0, + self_host_duration: 2377, + self_device_duration: 0 + }, + { + name: 'detach', + calls: 128, + host_duration: 364, + device_duration: 0, + self_host_duration: 361, + self_device_duration: 0 + }, + { + name: 'aten::detach', + calls: 128, + host_duration: 745, + device_duration: 0, + self_host_duration: 384, + self_device_duration: 0 + }, + { + name: 'aten::as_strided', + calls: 450, + host_duration: 527, + device_duration: 0, + self_host_duration: 527, + self_device_duration: 0 + }, + { + name: 'aten::unsqueeze', + calls: 192, + host_duration: 1050, + device_duration: 0, + self_host_duration: 869, + self_device_duration: 0 + }, + { + name: 'aten::empty_strided', + calls: 576, + host_duration: 3689, + device_duration: 0, + self_host_duration: 3689, + self_device_duration: 0 + }, + { + name: 'aten::copy_', + calls: 704, + host_duration: 8695, + device_duration: 0, + self_host_duration: 8695, + self_device_duration: 0 + }, + { + name: 'aten::_to_copy', + calls: 640, + host_duration: 18760, + device_duration: 0, + self_host_duration: 6122, + self_device_duration: 0 + }, + { + name: 'aten::upsample_bilinear2d', + calls: 64, + host_duration: 20349, 
+ device_duration: 0, + self_host_duration: 17634, + self_device_duration: 0 + }, + { + name: 'aten::squeeze', + calls: 64, + host_duration: 562, + device_duration: 0, + self_host_duration: 487, + self_device_duration: 0 + }, + { + name: 'aten::round', + calls: 64, + host_duration: 6658, + device_duration: 0, + self_host_duration: 6658, + self_device_duration: 0 + }, + { + name: 'aten::slice', + calls: 130, + host_duration: 1028, + device_duration: 0, + self_host_duration: 870, + self_device_duration: 0 + }, + { + name: 'detach_', + calls: 256, + host_duration: 142, + device_duration: 0, + self_host_duration: 129, + self_device_duration: 0 + }, + { + name: 'aten::detach_', + calls: 256, + host_duration: 755, + device_duration: 0, + self_host_duration: 626, + self_device_duration: 0 + }, + { + name: 'aten::result_type', + calls: 320, + host_duration: 168, + device_duration: 0, + self_host_duration: 168, + self_device_duration: 0 + }, + { + name: 'aten::pow', + calls: 320, + host_duration: 4922, + device_duration: 0, + self_host_duration: 4440, + self_device_duration: 0 + }, + { + name: 'aten::sub', + calls: 320, + host_duration: 9959, + device_duration: 0, + self_host_duration: 4339, + self_device_duration: 0 + }, + { + name: 'aten::gt', + calls: 320, + host_duration: 3848, + device_duration: 0, + self_host_duration: 2737, + self_device_duration: 0 + }, + { + name: 'aten::_local_scalar_dense', + calls: 384, + host_duration: 209, + device_duration: 0, + self_host_duration: 209, + self_device_duration: 0 + }, + { + name: 'aten::item', + calls: 384, + host_duration: 1398, + device_duration: 0, + self_host_duration: 1187, + self_device_duration: 0 + }, + { + name: 'aten::is_nonzero', + calls: 320, + host_duration: 2013, + device_duration: 0, + self_host_duration: 812, + self_device_duration: 0 + }, + { + name: 'aten::div', + calls: 64, + host_duration: 7421, + device_duration: 0, + self_host_duration: 6234, + self_device_duration: 0 + }, + { + name: 'aten::resize_', + calls: 2, + host_duration: 36, + device_duration: 0, + self_host_duration: 36, + self_device_duration: 0 + }, + { + name: 'aten::narrow', + calls: 2, + host_duration: 19, + device_duration: 0, + self_host_duration: 9, + self_device_duration: 0 + }, + { + name: 'aten::_cat', + calls: 2, + host_duration: 4628, + device_duration: 0, + self_host_duration: 4566, + self_device_duration: 0 + }, + { + name: 'aten::cat', + calls: 2, + host_duration: 4649, + device_duration: 0, + self_host_duration: 21, + self_device_duration: 0 + }, + { + name: 'aten::stack', + calls: 2, + host_duration: 10884, + device_duration: 0, + self_host_duration: 5859, + self_device_duration: 0 + } + ] + }, + path: '0-1' + }, + { + left: { + name: 'multiple nodes', + duration: 5170, + device_duration: 4402, + total_duration: 4402, + aggs: [ + { + name: 'aten::empty_strided', + calls: 2, + host_duration: 209, + device_duration: 0, + self_host_duration: 209, + self_device_duration: 0 + }, + { + name: 'aten::copy_', + calls: 2, + host_duration: 4696, + device_duration: 4402, + self_host_duration: 93, + self_device_duration: 4402 + }, + { + name: 'aten::_to_copy', + calls: 2, + host_duration: 5111, + device_duration: 4402, + self_host_duration: 206, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 2, + host_duration: 5170, + device_duration: 4402, + self_host_duration: 59, + self_device_duration: 0 + } + ] + }, + right: { + name: 'multiple nodes', + duration: 4681, + device_duration: 4350, + total_duration: 4350, + aggs: [ + { + name: 
'aten::empty_strided', + calls: 2, + host_duration: 65, + device_duration: 0, + self_host_duration: 65, + self_device_duration: 0 + }, + { + name: 'aten::copy_', + calls: 2, + host_duration: 4575, + device_duration: 4350, + self_host_duration: 26, + self_device_duration: 4350 + }, + { + name: 'aten::_to_copy', + calls: 2, + host_duration: 4670, + device_duration: 4350, + self_host_duration: 30, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 2, + host_duration: 4681, + device_duration: 4350, + self_host_duration: 11, + self_device_duration: 0 + } + ] + }, + path: '0-2' + }, + { + left: { + name: 'nn.Module: ResNet', + duration: 113664, + device_duration: 61356, + total_duration: 61356, + aggs: [ + { + name: 'aten::empty', + calls: 318, + host_duration: 14161, + device_duration: 0, + self_host_duration: 14161, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution', + calls: 53, + host_duration: 22091, + device_duration: 36599, + self_host_duration: 17567, + self_device_duration: 36599 + }, + { + name: 'aten::_convolution', + calls: 53, + host_duration: 25744, + device_duration: 36599, + self_host_duration: 3653, + self_device_duration: 0 + }, + { + name: 'aten::convolution', + calls: 53, + host_duration: 27753, + device_duration: 36599, + self_host_duration: 2009, + self_device_duration: 0 + }, + { + name: 'aten::conv2d', + calls: 53, + host_duration: 29777, + device_duration: 36599, + self_host_duration: 2024, + self_device_duration: 0 + }, + { + name: 'aten::add', + calls: 53, + host_duration: 6519, + device_duration: 54, + self_host_duration: 5666, + self_device_duration: 54 + }, + { + name: 'aten::empty_like', + calls: 53, + host_duration: 5624, + device_duration: 0, + self_host_duration: 2390, + self_device_duration: 0 + }, + { + name: 'aten::view', + calls: 53, + host_duration: 826, + device_duration: 0, + self_host_duration: 826, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm', + calls: 53, + host_duration: 35818, + device_duration: 12974, + self_host_duration: 20557, + self_device_duration: 12974 + }, + { + name: 'aten::_batch_norm_impl_index', + calls: 53, + host_duration: 38324, + device_duration: 12974, + self_host_duration: 2506, + self_device_duration: 0 + }, + { + name: 'aten::batch_norm', + calls: 53, + host_duration: 40105, + device_duration: 12974, + self_host_duration: 1781, + self_device_duration: 0 + }, + { + name: 'aten::clamp_min', + calls: 49, + host_duration: 2702, + device_duration: 6002, + self_host_duration: 1935, + self_device_duration: 6002 + }, + { + name: 'aten::clamp_min_', + calls: 49, + host_duration: 4273, + device_duration: 6002, + self_host_duration: 1571, + self_device_duration: 0 + }, + { + name: 'aten::relu_', + calls: 49, + host_duration: 8371, + device_duration: 6002, + self_host_duration: 4098, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices', + calls: 1, + host_duration: 230, + device_duration: 474, + self_host_duration: 212, + self_device_duration: 474 + }, + { + name: 'aten::max_pool2d', + calls: 1, + host_duration: 280, + device_duration: 474, + self_host_duration: 50, + self_device_duration: 0 + }, + { + name: 'aten::add_', + calls: 16, + host_duration: 1546, + device_duration: 5141, + self_host_duration: 1290, + self_device_duration: 5141 + }, + { + name: 'aten::mean', + calls: 1, + host_duration: 189, + device_duration: 69, + self_host_duration: 170, + self_device_duration: 69 + }, + { + name: 'aten::adaptive_avg_pool2d', + calls: 1, + host_duration: 234, + 
device_duration: 69, + self_host_duration: 45, + self_device_duration: 0 + }, + { + name: 'aten::_reshape_alias', + calls: 1, + host_duration: 52, + device_duration: 0, + self_host_duration: 52, + self_device_duration: 0 + }, + { + name: 'aten::flatten', + calls: 1, + host_duration: 106, + device_duration: 0, + self_host_duration: 54, + self_device_duration: 0 + }, + { + name: 'aten::as_strided', + calls: 2, + host_duration: 23, + device_duration: 0, + self_host_duration: 23, + self_device_duration: 0 + }, + { + name: 'aten::transpose', + calls: 1, + host_duration: 55, + device_duration: 0, + self_host_duration: 41, + self_device_duration: 0 + }, + { + name: 'aten::t', + calls: 1, + host_duration: 119, + device_duration: 0, + self_host_duration: 64, + self_device_duration: 0 + }, + { + name: 'aten::expand', + calls: 1, + host_duration: 49, + device_duration: 0, + self_host_duration: 40, + self_device_duration: 0 + }, + { + name: 'aten::addmm', + calls: 1, + host_duration: 404, + device_duration: 43, + self_host_duration: 302, + self_device_duration: 43 + }, + { + name: 'aten::linear', + calls: 1, + host_duration: 591, + device_duration: 43, + self_host_duration: 68, + self_device_duration: 0 + } + ] + }, + right: { + name: 'nn.Module: ResNet', + duration: 28725, + device_duration: 60899, + total_duration: 60899, + aggs: [ + { + name: 'aten::empty', + calls: 318, + host_duration: 2292, + device_duration: 0, + self_host_duration: 2292, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution', + calls: 53, + host_duration: 8713, + device_duration: 36205, + self_host_duration: 6819, + self_device_duration: 36205 + }, + { + name: 'aten::_convolution', + calls: 53, + host_duration: 9298, + device_duration: 36205, + self_host_duration: 585, + self_device_duration: 0 + }, + { + name: 'aten::convolution', + calls: 53, + host_duration: 9653, + device_duration: 36205, + self_host_duration: 355, + self_device_duration: 0 + }, + { + name: 'aten::conv2d', + calls: 53, + host_duration: 9932, + device_duration: 36205, + self_host_duration: 279, + self_device_duration: 0 + }, + { + name: 'aten::add', + calls: 53, + host_duration: 1897, + device_duration: 58, + self_host_duration: 1201, + self_device_duration: 58 + }, + { + name: 'aten::empty_like', + calls: 53, + host_duration: 933, + device_duration: 0, + self_host_duration: 284, + self_device_duration: 0 + }, + { + name: 'aten::view', + calls: 53, + host_duration: 130, + device_duration: 0, + self_host_duration: 130, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm', + calls: 53, + host_duration: 5540, + device_duration: 12913, + self_host_duration: 2504, + self_device_duration: 12913 + }, + { + name: 'aten::_batch_norm_impl_index', + calls: 53, + host_duration: 5942, + device_duration: 12913, + self_host_duration: 402, + self_device_duration: 0 + }, + { + name: 'aten::batch_norm', + calls: 53, + host_duration: 6219, + device_duration: 12913, + self_host_duration: 277, + self_device_duration: 0 + }, + { + name: 'aten::clamp_min', + calls: 49, + host_duration: 1108, + device_duration: 6006, + self_host_duration: 523, + self_device_duration: 6006 + }, + { + name: 'aten::clamp_min_', + calls: 49, + host_duration: 1315, + device_duration: 6006, + self_host_duration: 207, + self_device_duration: 0 + }, + { + name: 'aten::relu_', + calls: 49, + host_duration: 1939, + device_duration: 6006, + self_host_duration: 624, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices', + calls: 1, + host_duration: 53, + 
device_duration: 472, + self_host_duration: 38, + self_device_duration: 472 + }, + { + name: 'aten::max_pool2d', + calls: 1, + host_duration: 61, + device_duration: 472, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::add_', + calls: 16, + host_duration: 448, + device_duration: 5140, + self_host_duration: 268, + self_device_duration: 5140 + }, + { + name: 'aten::mean', + calls: 1, + host_duration: 53, + device_duration: 63, + self_host_duration: 39, + self_device_duration: 63 + }, + { + name: 'aten::adaptive_avg_pool2d', + calls: 1, + host_duration: 59, + device_duration: 63, + self_host_duration: 6, + self_device_duration: 0 + }, + { + name: 'aten::_reshape_alias', + calls: 1, + host_duration: 8, + device_duration: 0, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::flatten', + calls: 1, + host_duration: 15, + device_duration: 0, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::as_strided', + calls: 2, + host_duration: 3, + device_duration: 0, + self_host_duration: 3, + self_device_duration: 0 + }, + { + name: 'aten::transpose', + calls: 1, + host_duration: 8, + device_duration: 0, + self_host_duration: 6, + self_device_duration: 0 + }, + { + name: 'aten::t', + calls: 1, + host_duration: 15, + device_duration: 0, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::expand', + calls: 1, + host_duration: 6, + device_duration: 0, + self_host_duration: 5, + self_device_duration: 0 + }, + { + name: 'aten::addmm', + calls: 1, + host_duration: 173, + device_duration: 42, + self_host_duration: 123, + self_device_duration: 42 + }, + { + name: 'aten::linear', + calls: 1, + host_duration: 198, + device_duration: 42, + self_host_duration: 10, + self_device_duration: 0 + } + ] + }, + path: '0-3' + }, + { + left: { + name: 'nn.Module: CrossEntropyLoss', + duration: 711, + device_duration: 11, + total_duration: 11, + aggs: [ + { + name: 'aten::to', + calls: 1, + host_duration: 5, + device_duration: 0, + self_host_duration: 5, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax', + calls: 1, + host_duration: 158, + device_duration: 7, + self_host_duration: 139, + self_device_duration: 7 + }, + { + name: 'aten::log_softmax', + calls: 1, + host_duration: 241, + device_duration: 7, + self_host_duration: 78, + self_device_duration: 0 + }, + { + name: 'aten::resize_', + calls: 1, + host_duration: 5, + device_duration: 0, + self_host_duration: 5, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_forward', + calls: 1, + host_duration: 256, + device_duration: 4, + self_host_duration: 233, + self_device_duration: 4 + }, + { + name: 'aten::nll_loss', + calls: 1, + host_duration: 290, + device_duration: 4, + self_host_duration: 34, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_nd', + calls: 1, + host_duration: 313, + device_duration: 4, + self_host_duration: 23, + self_device_duration: 0 + }, + { + name: 'aten::cross_entropy_loss', + calls: 1, + host_duration: 614, + device_duration: 11, + self_host_duration: 60, + self_device_duration: 0 + } + ] + }, + right: { + name: 'nn.Module: CrossEntropyLoss', + duration: 156, + device_duration: 11, + total_duration: 11, + aggs: [ + { + name: 'aten::to', + calls: 1, + host_duration: 2, + device_duration: 0, + self_host_duration: 2, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax', + calls: 1, + host_duration: 42, + device_duration: 7, + self_host_duration: 28, + self_device_duration: 7 + }, + { + name: 'aten::log_softmax', + 
calls: 1, + host_duration: 54, + device_duration: 7, + self_host_duration: 10, + self_device_duration: 0 + }, + { + name: 'aten::resize_', + calls: 1, + host_duration: 0, + device_duration: 0, + self_host_duration: 0, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_forward', + calls: 1, + host_duration: 47, + device_duration: 4, + self_host_duration: 34, + self_device_duration: 4 + }, + { + name: 'aten::nll_loss', + calls: 1, + host_duration: 52, + device_duration: 4, + self_host_duration: 5, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_nd', + calls: 1, + host_duration: 56, + device_duration: 4, + self_host_duration: 4, + self_device_duration: 0 + }, + { + name: 'aten::cross_entropy_loss', + calls: 1, + host_duration: 119, + device_duration: 11, + self_host_duration: 9, + self_device_duration: 0 + } + ] + }, + path: '0-4' + }, + { + left: { + name: 'aten::zeros', + duration: 119, + device_duration: 0, + total_duration: 119, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 47, + device_duration: 0, + self_host_duration: 47, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 4, + device_duration: 0, + self_host_duration: 4, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 119, + device_duration: 0, + self_host_duration: 68, + self_device_duration: 0 + } + ] + }, + right: { + name: 'aten::zeros', + duration: 17, + device_duration: 0, + total_duration: 17, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 8, + device_duration: 0, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 2, + device_duration: 0, + self_host_duration: 2, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 17, + device_duration: 0, + self_host_duration: 7, + self_device_duration: 0 + } + ] + }, + path: '0-5' + }, + { + left: { + name: 'Optimizer.zero_grad#SGD.zero_grad', + duration: 22960, + device_duration: 142, + total_duration: 142, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 38, + device_duration: 0, + self_host_duration: 38, + self_device_duration: 0 + }, + { + name: 'aten::fill_', + calls: 161, + host_duration: 7097, + device_duration: 142, + self_host_duration: 4914, + self_device_duration: 142 + }, + { + name: 'aten::zero_', + calls: 161, + host_duration: 14725, + device_duration: 142, + self_host_duration: 7628, + self_device_duration: 0 + } + ] + }, + right: { + name: 'Optimizer.zero_grad#SGD.zero_grad', + duration: 4075, + device_duration: 264, + total_duration: 264, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 6, + device_duration: 0, + self_host_duration: 6, + self_device_duration: 0 + }, + { + name: 'aten::fill_', + calls: 161, + host_duration: 2036, + device_duration: 264, + self_host_duration: 909, + self_device_duration: 264 + }, + { + name: 'aten::zero_', + calls: 161, + host_duration: 2855, + device_duration: 264, + self_host_duration: 819, + self_device_duration: 0 + } + ] + }, + path: '0-6' + }, + { + left: { + name: 'aten::ones_like', + duration: 253, + device_duration: 1, + total_duration: 1, + aggs: [ + { + name: 'aten::empty_strided', + calls: 1, + host_duration: 79, + device_duration: 0, + self_host_duration: 79, + self_device_duration: 0 + }, + { + name: 'aten::empty_like', + calls: 1, + host_duration: 126, + device_duration: 0, + self_host_duration: 47, + self_device_duration: 0 + }, + { + name: 'aten::fill_', + calls: 1, + 
host_duration: 50, + device_duration: 1, + self_host_duration: 35, + self_device_duration: 1 + }, + { + name: 'aten::ones_like', + calls: 1, + host_duration: 253, + device_duration: 1, + self_host_duration: 77, + self_device_duration: 0 + } + ] + }, + right: { + name: 'aten::ones_like', + duration: 53, + device_duration: 1, + total_duration: 1, + aggs: [ + { + name: 'aten::empty_strided', + calls: 1, + host_duration: 18, + device_duration: 0, + self_host_duration: 18, + self_device_duration: 0 + }, + { + name: 'aten::empty_like', + calls: 1, + host_duration: 26, + device_duration: 0, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::fill_', + calls: 1, + host_duration: 20, + device_duration: 1, + self_host_duration: 8, + self_device_duration: 1 + }, + { + name: 'aten::ones_like', + calls: 1, + host_duration: 53, + device_duration: 1, + self_host_duration: 7, + self_device_duration: 0 + } + ] + }, + path: '0-7' + }, + { + left: { + name: 'nn.Module: CrossEntropyLoss.backward', + duration: 898, + device_duration: 13, + total_duration: 13, + aggs: [ + { + name: 'aten::fill_', + calls: 1, + host_duration: 69, + device_duration: 1, + self_host_duration: 43, + self_device_duration: 1 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 120, + device_duration: 1, + self_host_duration: 51, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_backward', + calls: 1, + host_duration: 304, + device_duration: 4, + self_host_duration: 168, + self_device_duration: 3 + }, + { + name: 'NllLossBackward0', + calls: 1, + host_duration: 368, + device_duration: 4, + self_host_duration: 64, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: NllLossBackward0', + calls: 1, + host_duration: 503, + device_duration: 4, + self_host_duration: 135, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax_backward_data', + calls: 1, + host_duration: 127, + device_duration: 9, + self_host_duration: 105, + self_device_duration: 9 + }, + { + name: 'LogSoftmaxBackward0', + calls: 1, + host_duration: 207, + device_duration: 9, + self_host_duration: 80, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: LogSoftmaxBackward0', + calls: 1, + host_duration: 349, + device_duration: 9, + self_host_duration: 142, + self_device_duration: 0 + } + ] + }, + right: { + name: 'nn.Module: CrossEntropyLoss.backward', + duration: 214, + device_duration: 14, + total_duration: 14, + aggs: [ + { + name: 'aten::fill_', + calls: 1, + host_duration: 36, + device_duration: 2, + self_host_duration: 13, + self_device_duration: 2 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 45, + device_duration: 2, + self_host_duration: 9, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_backward', + calls: 1, + host_duration: 99, + device_duration: 5, + self_host_duration: 43, + self_device_duration: 3 + }, + { + name: 'NllLossBackward0', + calls: 1, + host_duration: 112, + device_duration: 5, + self_host_duration: 13, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: NllLossBackward0', + calls: 1, + host_duration: 141, + device_duration: 5, + self_host_duration: 29, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax_backward_data', + calls: 1, + host_duration: 35, + device_duration: 9, + self_host_duration: 21, + self_device_duration: 9 + }, + { + name: 'LogSoftmaxBackward0', + calls: 1, + host_duration: 46, + device_duration: 9, + self_host_duration: 11, + self_device_duration: 0 + }, + { + 
name: + 'autograd::engine::evaluate_function: LogSoftmaxBackward0', + calls: 1, + host_duration: 64, + device_duration: 9, + self_host_duration: 18, + self_device_duration: 0 + } + ] + }, + path: '0-8' + }, + { + left: { + name: 'nn.Module: ResNet.backward', + duration: 180998, + device_duration: 123177, + total_duration: 123177, + aggs: [ + { + name: 'aten::as_strided', + calls: 5, + host_duration: 61, + device_duration: 0, + self_host_duration: 61, + self_device_duration: 0 + }, + { + name: 'aten::transpose', + calls: 4, + host_duration: 226, + device_duration: 0, + self_host_duration: 180, + self_device_duration: 0 + }, + { + name: 'aten::t', + calls: 4, + host_duration: 399, + device_duration: 0, + self_host_duration: 173, + self_device_duration: 0 + }, + { + name: 'aten::mm', + calls: 2, + host_duration: 345, + device_duration: 72, + self_host_duration: 282, + self_device_duration: 72 + }, + { + name: 'AddmmBackward0', + calls: 1, + host_duration: 854, + device_duration: 72, + self_host_duration: 208, + self_device_duration: 0 + }, + { + name: 'aten::sum', + calls: 1, + host_duration: 173, + device_duration: 8, + self_host_duration: 153, + self_device_duration: 8 + }, + { + name: 'aten::view', + calls: 54, + host_duration: 971, + device_duration: 0, + self_host_duration: 971, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: AddmmBackward0', + calls: 1, + host_duration: 1333, + device_duration: 80, + self_host_duration: 271, + self_device_duration: 0 + }, + { + name: 'aten::add_', + calls: 161, + host_duration: 12621, + device_duration: 501, + self_host_duration: 9839, + self_device_duration: 501 + }, + { + name: 'torch::autograd::AccumulateGrad', + calls: 161, + host_duration: 20767, + device_duration: 501, + self_host_duration: 8146, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: torch::autograd::AccumulateGrad', + calls: 161, + host_duration: 35735, + device_duration: 501, + self_host_duration: 14968, + self_device_duration: 0 + }, + { + name: 'TBackward0', + calls: 1, + host_duration: 128, + device_duration: 0, + self_host_duration: 30, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: TBackward0', + calls: 1, + host_duration: 197, + device_duration: 0, + self_host_duration: 69, + self_device_duration: 0 + }, + { + name: 'aten::_reshape_alias', + calls: 1, + host_duration: 31, + device_duration: 0, + self_host_duration: 31, + self_device_duration: 0 + }, + { + name: 'aten::reshape', + calls: 1, + host_duration: 79, + device_duration: 0, + self_host_duration: 48, + self_device_duration: 0 + }, + { + name: 'ReshapeAliasBackward0', + calls: 1, + host_duration: 131, + device_duration: 0, + self_host_duration: 52, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: ReshapeAliasBackward0', + calls: 1, + host_duration: 197, + device_duration: 0, + self_host_duration: 66, + self_device_duration: 0 + }, + { + name: 'aten::expand', + calls: 1, + host_duration: 84, + device_duration: 0, + self_host_duration: 69, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 1, + host_duration: 6, + device_duration: 0, + self_host_duration: 6, + self_device_duration: 0 + }, + { + name: 'aten::div', + calls: 1, + host_duration: 289, + device_duration: 38, + self_host_duration: 267, + self_device_duration: 38 + }, + { + name: 'MeanBackward1', + calls: 1, + host_duration: 489, + device_duration: 38, + self_host_duration: 110, + self_device_duration: 0 + }, + { + name: 
'autograd::engine::evaluate_function: MeanBackward1', + calls: 1, + host_duration: 592, + device_duration: 38, + self_host_duration: 103, + self_device_duration: 0 + }, + { + name: 'aten::threshold_backward', + calls: 49, + host_duration: 6958, + device_duration: 8972, + self_host_duration: 6094, + self_device_duration: 8972 + }, + { + name: 'ReluBackward0', + calls: 49, + host_duration: 10647, + device_duration: 8972, + self_host_duration: 3689, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: ReluBackward0', + calls: 49, + host_duration: 16826, + device_duration: 8972, + self_host_duration: 6179, + self_device_duration: 0 + }, + { + name: 'AddBackward0', + calls: 16, + host_duration: 129, + device_duration: 0, + self_host_duration: 129, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: AddBackward0', + calls: 16, + host_duration: 1301, + device_duration: 0, + self_host_duration: 1172, + self_device_duration: 0 + }, + { + name: 'aten::empty', + calls: 370, + host_duration: 20319, + device_duration: 0, + self_host_duration: 20319, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm_backward', + calls: 53, + host_duration: 31300, + device_duration: 22267, + self_host_duration: 18144, + self_device_duration: 22267 + }, + { + name: 'CudnnBatchNormBackward0', + calls: 53, + host_duration: 34805, + device_duration: 22267, + self_host_duration: 3505, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnBatchNormBackward0', + calls: 53, + host_duration: 44607, + device_duration: 22267, + self_host_duration: 9802, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution_backward_input', + calls: 52, + host_duration: 20324, + device_duration: 38733, + self_host_duration: 15252, + self_device_duration: 38733 + }, + { + name: 'aten::cudnn_convolution_backward_weight', + calls: 53, + host_duration: 21997, + device_duration: 45837, + self_host_duration: 13786, + self_device_duration: 45837 + }, + { + name: 'aten::cudnn_convolution_backward', + calls: 53, + host_duration: 50059, + device_duration: 84570, + self_host_duration: 7738, + self_device_duration: 0 + }, + { + name: 'CudnnConvolutionBackward0', + calls: 53, + host_duration: 53558, + device_duration: 84570, + self_host_duration: 3499, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnConvolutionBackward0', + calls: 53, + host_duration: 64252, + device_duration: 89775, + self_host_duration: 8462, + self_device_duration: 0 + }, + { + name: 'aten::add', + calls: 16, + host_duration: 2232, + device_duration: 5205, + self_host_duration: 1944, + self_device_duration: 5205 + }, + { + name: 'aten::fill_', + calls: 1, + host_duration: 61, + device_duration: 230, + self_host_duration: 44, + self_device_duration: 230 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 104, + device_duration: 230, + self_host_duration: 43, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices_backward', + calls: 1, + host_duration: 246, + device_duration: 1544, + self_host_duration: 128, + self_device_duration: 1314 + }, + { + name: 'MaxPool2DWithIndicesBackward0', + calls: 1, + host_duration: 304, + device_duration: 1544, + self_host_duration: 58, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: MaxPool2DWithIndicesBackward0', + calls: 1, + host_duration: 425, + device_duration: 1544, + self_host_duration: 121, + self_device_duration: 0 + } + ] + }, + 
right: { + name: 'nn.Module: ResNet.backward', + duration: 43714, + device_duration: 120604, + total_duration: 120604, + aggs: [ + { + name: 'aten::as_strided', + calls: 5, + host_duration: 9, + device_duration: 0, + self_host_duration: 9, + self_device_duration: 0 + }, + { + name: 'aten::transpose', + calls: 4, + host_duration: 38, + device_duration: 0, + self_host_duration: 31, + self_device_duration: 0 + }, + { + name: 'aten::t', + calls: 4, + host_duration: 59, + device_duration: 0, + self_host_duration: 21, + self_device_duration: 0 + }, + { + name: 'aten::mm', + calls: 2, + host_duration: 139, + device_duration: 67, + self_host_duration: 90, + self_device_duration: 67 + }, + { + name: 'AddmmBackward0', + calls: 1, + host_duration: 210, + device_duration: 67, + self_host_duration: 23, + self_device_duration: 0 + }, + { + name: 'aten::sum', + calls: 1, + host_duration: 47, + device_duration: 7, + self_host_duration: 32, + self_device_duration: 7 + }, + { + name: 'aten::view', + calls: 54, + host_duration: 166, + device_duration: 0, + self_host_duration: 166, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: AddmmBackward0', + calls: 1, + host_duration: 299, + device_duration: 74, + self_host_duration: 37, + self_device_duration: 0 + }, + { + name: 'aten::add_', + calls: 161, + host_duration: 4087, + device_duration: 534, + self_host_duration: 2037, + self_device_duration: 534 + }, + { + name: 'torch::autograd::AccumulateGrad', + calls: 161, + host_duration: 5134, + device_duration: 534, + self_host_duration: 1047, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: torch::autograd::AccumulateGrad', + calls: 161, + host_duration: 7473, + device_duration: 534, + self_host_duration: 2339, + self_device_duration: 0 + }, + { + name: 'TBackward0', + calls: 1, + host_duration: 14, + device_duration: 0, + self_host_duration: 3, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: TBackward0', + calls: 1, + host_duration: 21, + device_duration: 0, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::_reshape_alias', + calls: 1, + host_duration: 5, + device_duration: 0, + self_host_duration: 5, + self_device_duration: 0 + }, + { + name: 'aten::reshape', + calls: 1, + host_duration: 10, + device_duration: 0, + self_host_duration: 5, + self_device_duration: 0 + }, + { + name: 'ReshapeAliasBackward0', + calls: 1, + host_duration: 14, + device_duration: 0, + self_host_duration: 4, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: ReshapeAliasBackward0', + calls: 1, + host_duration: 21, + device_duration: 0, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::expand', + calls: 1, + host_duration: 9, + device_duration: 0, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 1, + host_duration: 1, + device_duration: 0, + self_host_duration: 1, + self_device_duration: 0 + }, + { + name: 'aten::div', + calls: 1, + host_duration: 70, + device_duration: 38, + self_host_duration: 49, + self_device_duration: 38 + }, + { + name: 'MeanBackward1', + calls: 1, + host_duration: 89, + device_duration: 38, + self_host_duration: 9, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: MeanBackward1', + calls: 1, + host_duration: 102, + device_duration: 38, + self_host_duration: 13, + self_device_duration: 0 + }, + { + name: 'aten::threshold_backward', + calls: 49, + host_duration: 1789, + 
device_duration: 9015, + self_host_duration: 1158, + self_device_duration: 9015 + }, + { + name: 'ReluBackward0', + calls: 49, + host_duration: 2237, + device_duration: 9015, + self_host_duration: 448, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: ReluBackward0', + calls: 49, + host_duration: 3144, + device_duration: 9015, + self_host_duration: 907, + self_device_duration: 0 + }, + { + name: 'AddBackward0', + calls: 16, + host_duration: 12, + device_duration: 0, + self_host_duration: 12, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: AddBackward0', + calls: 16, + host_duration: 126, + device_duration: 0, + self_host_duration: 114, + self_device_duration: 0 + }, + { + name: 'aten::empty', + calls: 370, + host_duration: 3292, + device_duration: 0, + self_host_duration: 3292, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm_backward', + calls: 53, + host_duration: 4896, + device_duration: 22157, + self_host_duration: 2136, + self_device_duration: 22157 + }, + { + name: 'CudnnBatchNormBackward0', + calls: 53, + host_duration: 5495, + device_duration: 22157, + self_host_duration: 599, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnBatchNormBackward0', + calls: 53, + host_duration: 7289, + device_duration: 22157, + self_host_duration: 1794, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution_backward_input', + calls: 52, + host_duration: 9468, + device_duration: 37714, + self_host_duration: 7052, + self_device_duration: 37714 + }, + { + name: 'aten::cudnn_convolution_backward_weight', + calls: 53, + host_duration: 8906, + device_duration: 44342, + self_host_duration: 5723, + self_device_duration: 44342 + }, + { + name: 'aten::cudnn_convolution_backward', + calls: 53, + host_duration: 19611, + device_duration: 82056, + self_host_duration: 1237, + self_device_duration: 0 + }, + { + name: 'CudnnConvolutionBackward0', + calls: 53, + host_duration: 20205, + device_duration: 82056, + self_host_duration: 594, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnConvolutionBackward0', + calls: 53, + host_duration: 22185, + device_duration: 87283, + self_host_duration: 1386, + self_device_duration: 0 + }, + { + name: 'aten::add', + calls: 16, + host_duration: 594, + device_duration: 5227, + self_host_duration: 380, + self_device_duration: 5227 + }, + { + name: 'aten::fill_', + calls: 1, + host_duration: 24, + device_duration: 230, + self_host_duration: 11, + self_device_duration: 230 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 32, + device_duration: 230, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices_backward', + calls: 1, + host_duration: 72, + device_duration: 1503, + self_host_duration: 31, + self_device_duration: 1273 + }, + { + name: 'MaxPool2DWithIndicesBackward0', + calls: 1, + host_duration: 82, + device_duration: 1503, + self_host_duration: 10, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: MaxPool2DWithIndicesBackward0', + calls: 1, + host_duration: 103, + device_duration: 1503, + self_host_duration: 21, + self_device_duration: 0 + } + ] + }, + path: '0-9' + }, + { + left: { + name: 'aten::zeros', + duration: 154, + device_duration: 0, + total_duration: 154, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 75, + device_duration: 0, + self_host_duration: 75, + self_device_duration: 0 + }, + { + name: 
'aten::zero_', + calls: 1, + host_duration: 4, + device_duration: 0, + self_host_duration: 4, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 154, + device_duration: 0, + self_host_duration: 75, + self_device_duration: 0 + } + ] + }, + right: { + name: 'aten::zeros', + duration: 42, + device_duration: 0, + total_duration: 42, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 32, + device_duration: 0, + self_host_duration: 32, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 1, + device_duration: 0, + self_host_duration: 1, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 42, + device_duration: 0, + self_host_duration: 9, + self_device_duration: 0 + } + ] + }, + path: '0-10' + }, + { + left: { + name: 'Optimizer.step#SGD.step', + duration: 75880, + device_duration: 1289, + total_duration: 1289, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 40, + device_duration: 0, + self_host_duration: 40, + self_device_duration: 0 + }, + { + name: 'aten::mul_', + calls: 161, + host_duration: 11873, + device_duration: 396, + self_host_duration: 9505, + self_device_duration: 396 + }, + { + name: 'aten::add_', + calls: 322, + host_duration: 22327, + device_duration: 893, + self_host_duration: 17668, + self_device_duration: 893 + } + ] + }, + right: { + name: 'Optimizer.step#SGD.step', + duration: 16441, + device_duration: 1305, + total_duration: 1305, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 6, + device_duration: 0, + self_host_duration: 6, + self_device_duration: 0 + }, + { + name: 'aten::mul_', + calls: 161, + host_duration: 3395, + device_duration: 399, + self_host_duration: 1806, + self_device_duration: 399 + }, + { + name: 'aten::add_', + calls: 322, + host_duration: 6217, + device_duration: 906, + self_host_duration: 3246, + self_device_duration: 906 + } + ] + }, + path: '0-11' + }, + { + left: { + name: 'multiple nodes', + duration: 145, + device_duration: 0, + total_duration: 145, + aggs: [ + { + name: 'aten::empty', + calls: 2, + host_duration: 79, + device_duration: 0, + self_host_duration: 79, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 4, + device_duration: 0, + self_host_duration: 4, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 106, + device_duration: 0, + self_host_duration: 62, + self_device_duration: 0 + } + ] + }, + right: { + name: 'multiple nodes', + duration: 15, + device_duration: 0, + total_duration: 15, + aggs: [ + { + name: 'aten::empty', + calls: 2, + host_duration: 10, + device_duration: 0, + self_host_duration: 10, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 0, + device_duration: 0, + self_host_duration: 0, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 9, + device_duration: 0, + self_host_duration: 5, + self_device_duration: 0 + } + ] + }, + path: '0-12' + }, + { + left: { + name: 'enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__', + duration: 1679463, + device_duration: 0, + total_duration: 1679463, + aggs: [ + { + name: 'aten::empty', + calls: 1413, + host_duration: 53837, + device_duration: 0, + self_host_duration: 53837, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 257, + host_duration: 955, + device_duration: 0, + self_host_duration: 955, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 
257, + host_duration: 26673, + device_duration: 0, + self_host_duration: 16083, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 1344, + host_duration: 824006, + device_duration: 0, + self_host_duration: 18525, + self_device_duration: 0 + }, + { + name: 'detach', + calls: 128, + host_duration: 2188, + device_duration: 0, + self_host_duration: 2188, + self_device_duration: 0 + }, + { + name: 'aten::detach', + calls: 128, + host_duration: 5295, + device_duration: 0, + self_host_duration: 3107, + self_device_duration: 0 + }, + { + name: 'aten::as_strided', + calls: 450, + host_duration: 4123, + device_duration: 0, + self_host_duration: 4123, + self_device_duration: 0 + }, + { + name: 'aten::unsqueeze', + calls: 192, + host_duration: 9590, + device_duration: 0, + self_host_duration: 8097, + self_device_duration: 0 + }, + { + name: 'aten::empty_strided', + calls: 576, + host_duration: 24764, + device_duration: 0, + self_host_duration: 24764, + self_device_duration: 0 + }, + { + name: 'aten::copy_', + calls: 704, + host_duration: 728608, + device_duration: 0, + self_host_duration: 728608, + self_device_duration: 0 + }, + { + name: 'aten::_to_copy', + calls: 640, + host_duration: 805481, + device_duration: 0, + self_host_duration: 51350, + self_device_duration: 0 + }, + { + name: 'aten::upsample_bilinear2d', + calls: 64, + host_duration: 236448, + device_duration: 0, + self_host_duration: 216887, + self_device_duration: 0 + }, + { + name: 'aten::squeeze', + calls: 64, + host_duration: 4682, + device_duration: 0, + self_host_duration: 4092, + self_device_duration: 0 + }, + { + name: 'aten::round', + calls: 64, + host_duration: 15283, + device_duration: 0, + self_host_duration: 15283, + self_device_duration: 0 + }, + { + name: 'aten::slice', + calls: 130, + host_duration: 8844, + device_duration: 0, + self_host_duration: 7513, + self_device_duration: 0 + }, + { + name: 'detach_', + calls: 256, + host_duration: 2102, + device_duration: 0, + self_host_duration: 2102, + self_device_duration: 0 + }, + { + name: 'aten::detach_', + calls: 256, + host_duration: 7286, + device_duration: 0, + self_host_duration: 5184, + self_device_duration: 0 + }, + { + name: 'aten::result_type', + calls: 320, + host_duration: 850, + device_duration: 0, + self_host_duration: 850, + self_device_duration: 0 + }, + { + name: 'aten::pow', + calls: 320, + host_duration: 43219, + device_duration: 0, + self_host_duration: 39305, + self_device_duration: 0 + }, + { + name: 'aten::sub', + calls: 320, + host_duration: 92093, + device_duration: 0, + self_host_duration: 37961, + self_device_duration: 0 + }, + { + name: 'aten::gt', + calls: 320, + host_duration: 35770, + device_duration: 0, + self_host_duration: 24869, + self_device_duration: 0 + }, + { + name: 'aten::_local_scalar_dense', + calls: 384, + host_duration: 2481, + device_duration: 0, + self_host_duration: 2481, + self_device_duration: 0 + }, + { + name: 'aten::item', + calls: 384, + host_duration: 10547, + device_duration: 0, + self_host_duration: 8066, + self_device_duration: 0 + }, + { + name: 'aten::is_nonzero', + calls: 320, + host_duration: 14029, + device_duration: 0, + self_host_duration: 5364, + self_device_duration: 0 + }, + { + name: 'aten::div', + calls: 64, + host_duration: 79760, + device_duration: 0, + self_host_duration: 68841, + self_device_duration: 0 + }, + { + name: 'aten::resize_', + calls: 2, + host_duration: 121, + device_duration: 0, + self_host_duration: 121, + self_device_duration: 0 + }, + { + name: 'aten::narrow', + calls: 2, + 
host_duration: 138, + device_duration: 0, + self_host_duration: 48, + self_device_duration: 0 + }, + { + name: 'aten::_cat', + calls: 2, + host_duration: 41467, + device_duration: 0, + self_host_duration: 41176, + self_device_duration: 0 + }, + { + name: 'aten::cat', + calls: 2, + host_duration: 41608, + device_duration: 0, + self_host_duration: 141, + self_device_duration: 0 + }, + { + name: 'aten::stack', + calls: 2, + host_duration: 49080, + device_duration: 0, + self_host_duration: 2720, + self_device_duration: 0 + } + ] + }, + right: { + name: 'enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__', + duration: 123490, + device_duration: 0, + total_duration: 123490, + aggs: [ + { + name: 'aten::empty', + calls: 1413, + host_duration: 6528, + device_duration: 0, + self_host_duration: 6528, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 257, + host_duration: 94, + device_duration: 0, + self_host_duration: 94, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 257, + host_duration: 2448, + device_duration: 0, + self_host_duration: 1214, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 1344, + host_duration: 16544, + device_duration: 0, + self_host_duration: 1856, + self_device_duration: 0 + }, + { + name: 'detach', + calls: 128, + host_duration: 337, + device_duration: 0, + self_host_duration: 337, + self_device_duration: 0 + }, + { + name: 'aten::detach', + calls: 128, + host_duration: 629, + device_duration: 0, + self_host_duration: 292, + self_device_duration: 0 + }, + { + name: 'aten::as_strided', + calls: 450, + host_duration: 464, + device_duration: 0, + self_host_duration: 464, + self_device_duration: 0 + }, + { + name: 'aten::unsqueeze', + calls: 192, + host_duration: 1024, + device_duration: 0, + self_host_duration: 854, + self_device_duration: 0 + }, + { + name: 'aten::empty_strided', + calls: 576, + host_duration: 3009, + device_duration: 0, + self_host_duration: 3009, + self_device_duration: 0 + }, + { + name: 'aten::copy_', + calls: 704, + host_duration: 7419, + device_duration: 0, + self_host_duration: 7419, + self_device_duration: 0 + }, + { + name: 'aten::_to_copy', + calls: 640, + host_duration: 14688, + device_duration: 0, + self_host_duration: 4039, + self_device_duration: 0 + }, + { + name: 'aten::upsample_bilinear2d', + calls: 64, + host_duration: 31439, + device_duration: 0, + self_host_duration: 29154, + self_device_duration: 0 + }, + { + name: 'aten::squeeze', + calls: 64, + host_duration: 473, + device_duration: 0, + self_host_duration: 408, + self_device_duration: 0 + }, + { + name: 'aten::round', + calls: 64, + host_duration: 4416, + device_duration: 0, + self_host_duration: 4416, + self_device_duration: 0 + }, + { + name: 'aten::slice', + calls: 130, + host_duration: 864, + device_duration: 0, + self_host_duration: 730, + self_device_duration: 0 + }, + { + name: 'detach_', + calls: 256, + host_duration: 136, + device_duration: 0, + self_host_duration: 115, + self_device_duration: 0 + }, + { + name: 'aten::detach_', + calls: 256, + host_duration: 586, + device_duration: 0, + self_host_duration: 471, + self_device_duration: 0 + }, + { + name: 'aten::result_type', + calls: 320, + host_duration: 149, + device_duration: 0, + self_host_duration: 149, + self_device_duration: 0 + }, + { + name: 'aten::pow', + calls: 320, + host_duration: 3935, + device_duration: 0, + self_host_duration: 3519, + self_device_duration: 0 + }, + { + name: 'aten::sub', + calls: 320, + host_duration: 7881, + device_duration: 0, + 
self_host_duration: 3349, + self_device_duration: 0 + }, + { + name: 'aten::gt', + calls: 320, + host_duration: 3055, + device_duration: 0, + self_host_duration: 2164, + self_device_duration: 0 + }, + { + name: 'aten::_local_scalar_dense', + calls: 384, + host_duration: 186, + device_duration: 0, + self_host_duration: 186, + self_device_duration: 0 + }, + { + name: 'aten::item', + calls: 384, + host_duration: 1134, + device_duration: 0, + self_host_duration: 943, + self_device_duration: 0 + }, + { + name: 'aten::is_nonzero', + calls: 320, + host_duration: 1588, + device_duration: 0, + self_host_duration: 615, + self_device_duration: 0 + }, + { + name: 'aten::div', + calls: 64, + host_duration: 4153, + device_duration: 0, + self_host_duration: 3203, + self_device_duration: 0 + }, + { + name: 'aten::resize_', + calls: 2, + host_duration: 42, + device_duration: 0, + self_host_duration: 42, + self_device_duration: 0 + }, + { + name: 'aten::narrow', + calls: 2, + host_duration: 18, + device_duration: 0, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::_cat', + calls: 2, + host_duration: 4613, + device_duration: 0, + self_host_duration: 4547, + self_device_duration: 0 + }, + { + name: 'aten::cat', + calls: 2, + host_duration: 4637, + device_duration: 0, + self_host_duration: 24, + self_device_duration: 0 + }, + { + name: 'aten::stack', + calls: 2, + host_duration: 5311, + device_duration: 0, + self_host_duration: 246, + self_device_duration: 0 + } + ] + }, + path: '0-13' + }, + { + left: { + name: 'multiple nodes', + duration: 5185, + device_duration: 4394, + total_duration: 4394, + aggs: [ + { + name: 'aten::empty_strided', + calls: 2, + host_duration: 203, + device_duration: 0, + self_host_duration: 203, + self_device_duration: 0 + }, + { + name: 'aten::copy_', + calls: 2, + host_duration: 4687, + device_duration: 4394, + self_host_duration: 94, + self_device_duration: 4394 + }, + { + name: 'aten::_to_copy', + calls: 2, + host_duration: 5113, + device_duration: 4394, + self_host_duration: 223, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 2, + host_duration: 5185, + device_duration: 4394, + self_host_duration: 72, + self_device_duration: 0 + } + ] + }, + right: { + name: 'multiple nodes', + duration: 4664, + device_duration: 4334, + total_duration: 4334, + aggs: [ + { + name: 'aten::empty_strided', + calls: 2, + host_duration: 60, + device_duration: 0, + self_host_duration: 60, + self_device_duration: 0 + }, + { + name: 'aten::copy_', + calls: 2, + host_duration: 4559, + device_duration: 4334, + self_host_duration: 26, + self_device_duration: 4334 + }, + { + name: 'aten::_to_copy', + calls: 2, + host_duration: 4655, + device_duration: 4334, + self_host_duration: 36, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 2, + host_duration: 4664, + device_duration: 4334, + self_host_duration: 9, + self_device_duration: 0 + } + ] + }, + path: '0-14' + }, + { + left: { + name: 'nn.Module: ResNet', + duration: 112761, + device_duration: 59848, + total_duration: 59848, + aggs: [ + { + name: 'aten::empty', + calls: 318, + host_duration: 13992, + device_duration: 0, + self_host_duration: 13992, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution', + calls: 53, + host_duration: 21952, + device_duration: 35233, + self_host_duration: 17460, + self_device_duration: 35233 + }, + { + name: 'aten::_convolution', + calls: 53, + host_duration: 25568, + device_duration: 35233, + self_host_duration: 3616, + self_device_duration: 0 + }, + { + 
name: 'aten::convolution', + calls: 53, + host_duration: 27534, + device_duration: 35233, + self_host_duration: 1966, + self_device_duration: 0 + }, + { + name: 'aten::conv2d', + calls: 53, + host_duration: 29546, + device_duration: 35233, + self_host_duration: 2012, + self_device_duration: 0 + }, + { + name: 'aten::add', + calls: 53, + host_duration: 6523, + device_duration: 53, + self_host_duration: 5669, + self_device_duration: 53 + }, + { + name: 'aten::empty_like', + calls: 53, + host_duration: 5605, + device_duration: 0, + self_host_duration: 2378, + self_device_duration: 0 + }, + { + name: 'aten::view', + calls: 53, + host_duration: 829, + device_duration: 0, + self_host_duration: 829, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm', + calls: 53, + host_duration: 35510, + device_duration: 12828, + self_host_duration: 20387, + self_device_duration: 12828 + }, + { + name: 'aten::_batch_norm_impl_index', + calls: 53, + host_duration: 38030, + device_duration: 12828, + self_host_duration: 2520, + self_device_duration: 0 + }, + { + name: 'aten::batch_norm', + calls: 53, + host_duration: 39727, + device_duration: 12828, + self_host_duration: 1697, + self_device_duration: 0 + }, + { + name: 'aten::clamp_min', + calls: 49, + host_duration: 2715, + device_duration: 5998, + self_host_duration: 1950, + self_device_duration: 5998 + }, + { + name: 'aten::clamp_min_', + calls: 49, + host_duration: 4264, + device_duration: 5998, + self_host_duration: 1549, + self_device_duration: 0 + }, + { + name: 'aten::relu_', + calls: 49, + host_duration: 8337, + device_duration: 5998, + self_host_duration: 4073, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices', + calls: 1, + host_duration: 212, + device_duration: 466, + self_host_duration: 193, + self_device_duration: 466 + }, + { + name: 'aten::max_pool2d', + calls: 1, + host_duration: 262, + device_duration: 466, + self_host_duration: 50, + self_device_duration: 0 + }, + { + name: 'aten::add_', + calls: 16, + host_duration: 1553, + device_duration: 5165, + self_host_duration: 1297, + self_device_duration: 5165 + }, + { + name: 'aten::mean', + calls: 1, + host_duration: 187, + device_duration: 64, + self_host_duration: 169, + self_device_duration: 64 + }, + { + name: 'aten::adaptive_avg_pool2d', + calls: 1, + host_duration: 231, + device_duration: 64, + self_host_duration: 44, + self_device_duration: 0 + }, + { + name: 'aten::_reshape_alias', + calls: 1, + host_duration: 52, + device_duration: 0, + self_host_duration: 52, + self_device_duration: 0 + }, + { + name: 'aten::flatten', + calls: 1, + host_duration: 101, + device_duration: 0, + self_host_duration: 49, + self_device_duration: 0 + }, + { + name: 'aten::as_strided', + calls: 2, + host_duration: 21, + device_duration: 0, + self_host_duration: 21, + self_device_duration: 0 + }, + { + name: 'aten::transpose', + calls: 1, + host_duration: 51, + device_duration: 0, + self_host_duration: 40, + self_device_duration: 0 + }, + { + name: 'aten::t', + calls: 1, + host_duration: 120, + device_duration: 0, + self_host_duration: 69, + self_device_duration: 0 + }, + { + name: 'aten::expand', + calls: 1, + host_duration: 49, + device_duration: 0, + self_host_duration: 39, + self_device_duration: 0 + }, + { + name: 'aten::addmm', + calls: 1, + host_duration: 405, + device_duration: 41, + self_host_duration: 302, + self_device_duration: 41 + }, + { + name: 'aten::linear', + calls: 1, + host_duration: 594, + device_duration: 41, + self_host_duration: 69, + 
self_device_duration: 0 + } + ] + }, + right: { + name: 'nn.Module: ResNet', + duration: 28459, + device_duration: 59832, + total_duration: 59832, + aggs: [ + { + name: 'aten::empty', + calls: 318, + host_duration: 2234, + device_duration: 0, + self_host_duration: 2234, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution', + calls: 53, + host_duration: 8644, + device_duration: 35209, + self_host_duration: 6782, + self_device_duration: 35209 + }, + { + name: 'aten::_convolution', + calls: 53, + host_duration: 9216, + device_duration: 35209, + self_host_duration: 572, + self_device_duration: 0 + }, + { + name: 'aten::convolution', + calls: 53, + host_duration: 9532, + device_duration: 35209, + self_host_duration: 316, + self_device_duration: 0 + }, + { + name: 'aten::conv2d', + calls: 53, + host_duration: 9818, + device_duration: 35209, + self_host_duration: 286, + self_device_duration: 0 + }, + { + name: 'aten::add', + calls: 53, + host_duration: 1898, + device_duration: 55, + self_host_duration: 1202, + self_device_duration: 55 + }, + { + name: 'aten::empty_like', + calls: 53, + host_duration: 941, + device_duration: 0, + self_host_duration: 300, + self_device_duration: 0 + }, + { + name: 'aten::view', + calls: 53, + host_duration: 137, + device_duration: 0, + self_host_duration: 137, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm', + calls: 53, + host_duration: 5543, + device_duration: 12824, + self_host_duration: 2527, + self_device_duration: 12824 + }, + { + name: 'aten::_batch_norm_impl_index', + calls: 53, + host_duration: 5914, + device_duration: 12824, + self_host_duration: 371, + self_device_duration: 0 + }, + { + name: 'aten::batch_norm', + calls: 53, + host_duration: 6167, + device_duration: 12824, + self_host_duration: 253, + self_device_duration: 0 + }, + { + name: 'aten::clamp_min', + calls: 49, + host_duration: 1081, + device_duration: 6004, + self_host_duration: 507, + self_device_duration: 6004 + }, + { + name: 'aten::clamp_min_', + calls: 49, + host_duration: 1299, + device_duration: 6004, + self_host_duration: 218, + self_device_duration: 0 + }, + { + name: 'aten::relu_', + calls: 49, + host_duration: 1941, + device_duration: 6004, + self_host_duration: 642, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices', + calls: 1, + host_duration: 59, + device_duration: 466, + self_host_duration: 44, + self_device_duration: 466 + }, + { + name: 'aten::max_pool2d', + calls: 1, + host_duration: 66, + device_duration: 466, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::add_', + calls: 16, + host_duration: 443, + device_duration: 5169, + self_host_duration: 267, + self_device_duration: 5169 + }, + { + name: 'aten::mean', + calls: 1, + host_duration: 51, + device_duration: 63, + self_host_duration: 37, + self_device_duration: 63 + }, + { + name: 'aten::adaptive_avg_pool2d', + calls: 1, + host_duration: 58, + device_duration: 63, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::_reshape_alias', + calls: 1, + host_duration: 8, + device_duration: 0, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::flatten', + calls: 1, + host_duration: 16, + device_duration: 0, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::as_strided', + calls: 2, + host_duration: 3, + device_duration: 0, + self_host_duration: 3, + self_device_duration: 0 + }, + { + name: 'aten::transpose', + calls: 1, + host_duration: 10, + device_duration: 0, + 
self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::t', + calls: 1, + host_duration: 18, + device_duration: 0, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::expand', + calls: 1, + host_duration: 5, + device_duration: 0, + self_host_duration: 4, + self_device_duration: 0 + }, + { + name: 'aten::addmm', + calls: 1, + host_duration: 161, + device_duration: 42, + self_host_duration: 111, + self_device_duration: 42 + }, + { + name: 'aten::linear', + calls: 1, + host_duration: 188, + device_duration: 42, + self_host_duration: 9, + self_device_duration: 0 + } + ] + }, + path: '0-15' + }, + { + left: { + name: 'nn.Module: CrossEntropyLoss', + duration: 712, + device_duration: 11, + total_duration: 11, + aggs: [ + { + name: 'aten::to', + calls: 1, + host_duration: 6, + device_duration: 0, + self_host_duration: 6, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax', + calls: 1, + host_duration: 150, + device_duration: 7, + self_host_duration: 132, + self_device_duration: 7 + }, + { + name: 'aten::log_softmax', + calls: 1, + host_duration: 231, + device_duration: 7, + self_host_duration: 75, + self_device_duration: 0 + }, + { + name: 'aten::resize_', + calls: 1, + host_duration: 5, + device_duration: 0, + self_host_duration: 5, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_forward', + calls: 1, + host_duration: 266, + device_duration: 4, + self_host_duration: 243, + self_device_duration: 4 + }, + { + name: 'aten::nll_loss', + calls: 1, + host_duration: 300, + device_duration: 4, + self_host_duration: 34, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_nd', + calls: 1, + host_duration: 328, + device_duration: 4, + self_host_duration: 28, + self_device_duration: 0 + }, + { + name: 'aten::cross_entropy_loss', + calls: 1, + host_duration: 620, + device_duration: 11, + self_host_duration: 61, + self_device_duration: 0 + } + ] + }, + right: { + name: 'nn.Module: CrossEntropyLoss', + duration: 156, + device_duration: 11, + total_duration: 11, + aggs: [ + { + name: 'aten::to', + calls: 1, + host_duration: 1, + device_duration: 0, + self_host_duration: 1, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax', + calls: 1, + host_duration: 41, + device_duration: 7, + self_host_duration: 27, + self_device_duration: 7 + }, + { + name: 'aten::log_softmax', + calls: 1, + host_duration: 52, + device_duration: 7, + self_host_duration: 10, + self_device_duration: 0 + }, + { + name: 'aten::resize_', + calls: 1, + host_duration: 1, + device_duration: 0, + self_host_duration: 1, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_forward', + calls: 1, + host_duration: 49, + device_duration: 4, + self_host_duration: 34, + self_device_duration: 4 + }, + { + name: 'aten::nll_loss', + calls: 1, + host_duration: 53, + device_duration: 4, + self_host_duration: 4, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_nd', + calls: 1, + host_duration: 57, + device_duration: 4, + self_host_duration: 4, + self_device_duration: 0 + }, + { + name: 'aten::cross_entropy_loss', + calls: 1, + host_duration: 124, + device_duration: 11, + self_host_duration: 15, + self_device_duration: 0 + } + ] + }, + path: '0-16' + }, + { + left: { + name: 'aten::zeros', + duration: 109, + device_duration: 0, + total_duration: 109, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 39, + device_duration: 0, + self_host_duration: 39, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 5, + 
device_duration: 0, + self_host_duration: 5, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 109, + device_duration: 0, + self_host_duration: 65, + self_device_duration: 0 + } + ] + }, + right: { + name: 'aten::zeros', + duration: 23, + device_duration: 0, + total_duration: 23, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 13, + device_duration: 0, + self_host_duration: 13, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 1, + device_duration: 0, + self_host_duration: 1, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 23, + device_duration: 0, + self_host_duration: 9, + self_device_duration: 0 + } + ] + }, + path: '0-17' + }, + { + left: { + name: 'Optimizer.zero_grad#SGD.zero_grad', + duration: 24374, + device_duration: 132, + total_duration: 132, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 44, + device_duration: 0, + self_host_duration: 44, + self_device_duration: 0 + }, + { + name: 'aten::fill_', + calls: 161, + host_duration: 7104, + device_duration: 132, + self_host_duration: 4941, + self_device_duration: 132 + }, + { + name: 'aten::zero_', + calls: 161, + host_duration: 14806, + device_duration: 132, + self_host_duration: 7702, + self_device_duration: 0 + } + ] + }, + right: { + name: 'Optimizer.zero_grad#SGD.zero_grad', + duration: 4461, + device_duration: 137, + total_duration: 137, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 6, + device_duration: 0, + self_host_duration: 6, + self_device_duration: 0 + }, + { + name: 'aten::fill_', + calls: 161, + host_duration: 1945, + device_duration: 137, + self_host_duration: 878, + self_device_duration: 137 + }, + { + name: 'aten::zero_', + calls: 161, + host_duration: 2805, + device_duration: 137, + self_host_duration: 860, + self_device_duration: 0 + } + ] + }, + path: '0-18' + }, + { + left: { + name: 'aten::ones_like', + duration: 263, + device_duration: 1, + total_duration: 1, + aggs: [ + { + name: 'aten::empty_strided', + calls: 1, + host_duration: 99, + device_duration: 0, + self_host_duration: 99, + self_device_duration: 0 + }, + { + name: 'aten::empty_like', + calls: 1, + host_duration: 149, + device_duration: 0, + self_host_duration: 50, + self_device_duration: 0 + }, + { + name: 'aten::fill_', + calls: 1, + host_duration: 49, + device_duration: 1, + self_host_duration: 34, + self_device_duration: 1 + }, + { + name: 'aten::ones_like', + calls: 1, + host_duration: 263, + device_duration: 1, + self_host_duration: 65, + self_device_duration: 0 + } + ] + }, + right: { + name: 'aten::ones_like', + duration: 51, + device_duration: 1, + total_duration: 1, + aggs: [ + { + name: 'aten::empty_strided', + calls: 1, + host_duration: 18, + device_duration: 0, + self_host_duration: 18, + self_device_duration: 0 + }, + { + name: 'aten::empty_like', + calls: 1, + host_duration: 24, + device_duration: 0, + self_host_duration: 6, + self_device_duration: 0 + }, + { + name: 'aten::fill_', + calls: 1, + host_duration: 20, + device_duration: 1, + self_host_duration: 8, + self_device_duration: 1 + }, + { + name: 'aten::ones_like', + calls: 1, + host_duration: 51, + device_duration: 1, + self_host_duration: 7, + self_device_duration: 0 + } + ] + }, + path: '0-19' + }, + { + left: { + name: 'nn.Module: CrossEntropyLoss.backward', + duration: 845, + device_duration: 13, + total_duration: 13, + aggs: [ + { + name: 'aten::fill_', + calls: 1, + host_duration: 58, + device_duration: 1, 
+ self_host_duration: 36, + self_device_duration: 1 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 112, + device_duration: 1, + self_host_duration: 54, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_backward', + calls: 1, + host_duration: 269, + device_duration: 4, + self_host_duration: 142, + self_device_duration: 3 + }, + { + name: 'NllLossBackward0', + calls: 1, + host_duration: 406, + device_duration: 4, + self_host_duration: 137, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: NllLossBackward0', + calls: 1, + host_duration: 522, + device_duration: 4, + self_host_duration: 116, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax_backward_data', + calls: 1, + host_duration: 109, + device_duration: 9, + self_host_duration: 91, + self_device_duration: 9 + }, + { + name: 'LogSoftmaxBackward0', + calls: 1, + host_duration: 178, + device_duration: 9, + self_host_duration: 69, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: LogSoftmaxBackward0', + calls: 1, + host_duration: 283, + device_duration: 9, + self_host_duration: 105, + self_device_duration: 0 + } + ] + }, + right: { + name: 'nn.Module: CrossEntropyLoss.backward', + duration: 283, + device_duration: 13, + total_duration: 13, + aggs: [ + { + name: 'aten::fill_', + calls: 1, + host_duration: 33, + device_duration: 1, + self_host_duration: 12, + self_device_duration: 1 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 41, + device_duration: 1, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::nll_loss_backward', + calls: 1, + host_duration: 93, + device_duration: 4, + self_host_duration: 41, + self_device_duration: 3 + }, + { + name: 'NllLossBackward0', + calls: 1, + host_duration: 185, + device_duration: 4, + self_host_duration: 92, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: NllLossBackward0', + calls: 1, + host_duration: 211, + device_duration: 4, + self_host_duration: 26, + self_device_duration: 0 + }, + { + name: 'aten::_log_softmax_backward_data', + calls: 1, + host_duration: 36, + device_duration: 9, + self_host_duration: 22, + self_device_duration: 9 + }, + { + name: 'LogSoftmaxBackward0', + calls: 1, + host_duration: 45, + device_duration: 9, + self_host_duration: 9, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: LogSoftmaxBackward0', + calls: 1, + host_duration: 62, + device_duration: 9, + self_host_duration: 17, + self_device_duration: 0 + } + ] + }, + path: '0-20' + }, + { + left: { + name: 'nn.Module: ResNet.backward', + duration: 180218, + device_duration: 120676, + total_duration: 120676, + aggs: [ + { + name: 'aten::as_strided', + calls: 5, + host_duration: 67, + device_duration: 0, + self_host_duration: 67, + self_device_duration: 0 + }, + { + name: 'aten::transpose', + calls: 4, + host_duration: 255, + device_duration: 0, + self_host_duration: 204, + self_device_duration: 0 + }, + { + name: 'aten::t', + calls: 4, + host_duration: 430, + device_duration: 0, + self_host_duration: 175, + self_device_duration: 0 + }, + { + name: 'aten::mm', + calls: 2, + host_duration: 323, + device_duration: 68, + self_host_duration: 265, + self_device_duration: 68 + }, + { + name: 'AddmmBackward0', + calls: 1, + host_duration: 844, + device_duration: 68, + self_host_duration: 209, + self_device_duration: 0 + }, + { + name: 'aten::sum', + calls: 1, + host_duration: 197, + device_duration: 7, + self_host_duration: 175, + 
self_device_duration: 7 + }, + { + name: 'aten::view', + calls: 54, + host_duration: 963, + device_duration: 0, + self_host_duration: 963, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: AddmmBackward0', + calls: 1, + host_duration: 1377, + device_duration: 75, + self_host_duration: 296, + self_device_duration: 0 + }, + { + name: 'aten::add_', + calls: 161, + host_duration: 12404, + device_duration: 496, + self_host_duration: 9659, + self_device_duration: 496 + }, + { + name: 'torch::autograd::AccumulateGrad', + calls: 161, + host_duration: 20417, + device_duration: 496, + self_host_duration: 8013, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: torch::autograd::AccumulateGrad', + calls: 161, + host_duration: 35211, + device_duration: 496, + self_host_duration: 14794, + self_device_duration: 0 + }, + { + name: 'TBackward0', + calls: 1, + host_duration: 152, + device_duration: 0, + self_host_duration: 34, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: TBackward0', + calls: 1, + host_duration: 231, + device_duration: 0, + self_host_duration: 79, + self_device_duration: 0 + }, + { + name: 'aten::_reshape_alias', + calls: 1, + host_duration: 35, + device_duration: 0, + self_host_duration: 35, + self_device_duration: 0 + }, + { + name: 'aten::reshape', + calls: 1, + host_duration: 91, + device_duration: 0, + self_host_duration: 56, + self_device_duration: 0 + }, + { + name: 'ReshapeAliasBackward0', + calls: 1, + host_duration: 133, + device_duration: 0, + self_host_duration: 42, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: ReshapeAliasBackward0', + calls: 1, + host_duration: 205, + device_duration: 0, + self_host_duration: 72, + self_device_duration: 0 + }, + { + name: 'aten::expand', + calls: 1, + host_duration: 95, + device_duration: 0, + self_host_duration: 79, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 1, + host_duration: 7, + device_duration: 0, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::div', + calls: 1, + host_duration: 324, + device_duration: 37, + self_host_duration: 301, + self_device_duration: 37 + }, + { + name: 'MeanBackward1', + calls: 1, + host_duration: 547, + device_duration: 37, + self_host_duration: 121, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: MeanBackward1', + calls: 1, + host_duration: 662, + device_duration: 37, + self_host_duration: 115, + self_device_duration: 0 + }, + { + name: 'aten::threshold_backward', + calls: 49, + host_duration: 6880, + device_duration: 9012, + self_host_duration: 6037, + self_device_duration: 9012 + }, + { + name: 'ReluBackward0', + calls: 49, + host_duration: 10536, + device_duration: 9012, + self_host_duration: 3656, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: ReluBackward0', + calls: 49, + host_duration: 16666, + device_duration: 9012, + self_host_duration: 6130, + self_device_duration: 0 + }, + { + name: 'AddBackward0', + calls: 16, + host_duration: 122, + device_duration: 0, + self_host_duration: 122, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: AddBackward0', + calls: 16, + host_duration: 1278, + device_duration: 0, + self_host_duration: 1156, + self_device_duration: 0 + }, + { + name: 'aten::empty', + calls: 370, + host_duration: 21126, + device_duration: 0, + self_host_duration: 21126, + self_device_duration: 0 + }, + { + name: 
'aten::cudnn_batch_norm_backward', + calls: 53, + host_duration: 30875, + device_duration: 22166, + self_host_duration: 17909, + self_device_duration: 22166 + }, + { + name: 'CudnnBatchNormBackward0', + calls: 53, + host_duration: 34355, + device_duration: 22166, + self_host_duration: 3480, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnBatchNormBackward0', + calls: 53, + host_duration: 44006, + device_duration: 22166, + self_host_duration: 9651, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution_backward_input', + calls: 52, + host_duration: 20496, + device_duration: 37887, + self_host_duration: 15516, + self_device_duration: 37887 + }, + { + name: 'aten::cudnn_convolution_backward_weight', + calls: 53, + host_duration: 22878, + device_duration: 44271, + self_host_duration: 13672, + self_device_duration: 44271 + }, + { + name: 'aten::cudnn_convolution_backward', + calls: 53, + host_duration: 50961, + device_duration: 82158, + self_host_duration: 7587, + self_device_duration: 0 + }, + { + name: 'CudnnConvolutionBackward0', + calls: 53, + host_duration: 54406, + device_duration: 82158, + self_host_duration: 3445, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnConvolutionBackward0', + calls: 53, + host_duration: 64877, + device_duration: 87386, + self_host_duration: 8284, + self_device_duration: 0 + }, + { + name: 'aten::add', + calls: 16, + host_duration: 2187, + device_duration: 5228, + self_host_duration: 1909, + self_device_duration: 5228 + }, + { + name: 'aten::fill_', + calls: 1, + host_duration: 53, + device_duration: 230, + self_host_duration: 36, + self_device_duration: 230 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 96, + device_duration: 230, + self_host_duration: 43, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices_backward', + calls: 1, + host_duration: 237, + device_duration: 1504, + self_host_duration: 129, + self_device_duration: 1274 + }, + { + name: 'MaxPool2DWithIndicesBackward0', + calls: 1, + host_duration: 295, + device_duration: 1504, + self_host_duration: 58, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: MaxPool2DWithIndicesBackward0', + calls: 1, + host_duration: 411, + device_duration: 1504, + self_host_duration: 116, + self_device_duration: 0 + } + ] + }, + right: { + name: 'nn.Module: ResNet.backward', + duration: 45132, + device_duration: 121137, + total_duration: 121137, + aggs: [ + { + name: 'aten::as_strided', + calls: 5, + host_duration: 7, + device_duration: 0, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::transpose', + calls: 4, + host_duration: 29, + device_duration: 0, + self_host_duration: 23, + self_device_duration: 0 + }, + { + name: 'aten::t', + calls: 4, + host_duration: 53, + device_duration: 0, + self_host_duration: 24, + self_device_duration: 0 + }, + { + name: 'aten::mm', + calls: 2, + host_duration: 144, + device_duration: 67, + self_host_duration: 96, + self_device_duration: 67 + }, + { + name: 'AddmmBackward0', + calls: 1, + host_duration: 208, + device_duration: 67, + self_host_duration: 24, + self_device_duration: 0 + }, + { + name: 'aten::sum', + calls: 1, + host_duration: 45, + device_duration: 7, + self_host_duration: 30, + self_device_duration: 7 + }, + { + name: 'aten::view', + calls: 54, + host_duration: 163, + device_duration: 0, + self_host_duration: 163, + self_device_duration: 0 + }, + { + name: 
'autograd::engine::evaluate_function: AddmmBackward0', + calls: 1, + host_duration: 295, + device_duration: 74, + self_host_duration: 38, + self_device_duration: 0 + }, + { + name: 'aten::add_', + calls: 161, + host_duration: 4103, + device_duration: 535, + self_host_duration: 2037, + self_device_duration: 535 + }, + { + name: 'torch::autograd::AccumulateGrad', + calls: 161, + host_duration: 5183, + device_duration: 535, + self_host_duration: 1080, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: torch::autograd::AccumulateGrad', + calls: 161, + host_duration: 7655, + device_duration: 535, + self_host_duration: 2472, + self_device_duration: 0 + }, + { + name: 'TBackward0', + calls: 1, + host_duration: 16, + device_duration: 0, + self_host_duration: 3, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: TBackward0', + calls: 1, + host_duration: 24, + device_duration: 0, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::_reshape_alias', + calls: 1, + host_duration: 5, + device_duration: 0, + self_host_duration: 5, + self_device_duration: 0 + }, + { + name: 'aten::reshape', + calls: 1, + host_duration: 10, + device_duration: 0, + self_host_duration: 5, + self_device_duration: 0 + }, + { + name: 'ReshapeAliasBackward0', + calls: 1, + host_duration: 17, + device_duration: 0, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: ReshapeAliasBackward0', + calls: 1, + host_duration: 27, + device_duration: 0, + self_host_duration: 10, + self_device_duration: 0 + }, + { + name: 'aten::expand', + calls: 1, + host_duration: 10, + device_duration: 0, + self_host_duration: 9, + self_device_duration: 0 + }, + { + name: 'aten::to', + calls: 1, + host_duration: 1, + device_duration: 0, + self_host_duration: 1, + self_device_duration: 0 + }, + { + name: 'aten::div', + calls: 1, + host_duration: 63, + device_duration: 37, + self_host_duration: 45, + self_device_duration: 37 + }, + { + name: 'MeanBackward1', + calls: 1, + host_duration: 83, + device_duration: 37, + self_host_duration: 9, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: MeanBackward1', + calls: 1, + host_duration: 99, + device_duration: 37, + self_host_duration: 16, + self_device_duration: 0 + }, + { + name: 'aten::threshold_backward', + calls: 49, + host_duration: 1863, + device_duration: 9003, + self_host_duration: 1203, + self_device_duration: 9003 + }, + { + name: 'ReluBackward0', + calls: 49, + host_duration: 2330, + device_duration: 9003, + self_host_duration: 467, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: ReluBackward0', + calls: 49, + host_duration: 3313, + device_duration: 9003, + self_host_duration: 983, + self_device_duration: 0 + }, + { + name: 'AddBackward0', + calls: 16, + host_duration: 14, + device_duration: 0, + self_host_duration: 14, + self_device_duration: 0 + }, + { + name: 'autograd::engine::evaluate_function: AddBackward0', + calls: 16, + host_duration: 135, + device_duration: 0, + self_host_duration: 121, + self_device_duration: 0 + }, + { + name: 'aten::empty', + calls: 370, + host_duration: 4638, + device_duration: 0, + self_host_duration: 4638, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_batch_norm_backward', + calls: 53, + host_duration: 5047, + device_duration: 22244, + self_host_duration: 2219, + self_device_duration: 22244 + }, + { + name: 'CudnnBatchNormBackward0', + calls: 53, + 
host_duration: 5637, + device_duration: 22244, + self_host_duration: 590, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnBatchNormBackward0', + calls: 53, + host_duration: 7407, + device_duration: 22244, + self_host_duration: 1770, + self_device_duration: 0 + }, + { + name: 'aten::cudnn_convolution_backward_input', + calls: 52, + host_duration: 9345, + device_duration: 37854, + self_host_duration: 6945, + self_device_duration: 37854 + }, + { + name: 'aten::cudnn_convolution_backward_weight', + calls: 53, + host_duration: 9886, + device_duration: 44650, + self_host_duration: 5378, + self_device_duration: 44650 + }, + { + name: 'aten::cudnn_convolution_backward', + calls: 53, + host_duration: 20453, + device_duration: 82504, + self_host_duration: 1222, + self_device_duration: 0 + }, + { + name: 'CudnnConvolutionBackward0', + calls: 53, + host_duration: 21000, + device_duration: 82504, + self_host_duration: 547, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: CudnnConvolutionBackward0', + calls: 53, + host_duration: 23024, + device_duration: 87731, + self_host_duration: 1440, + self_device_duration: 0 + }, + { + name: 'aten::add', + calls: 16, + host_duration: 584, + device_duration: 5227, + self_host_duration: 374, + self_device_duration: 5227 + }, + { + name: 'aten::fill_', + calls: 1, + host_duration: 26, + device_duration: 230, + self_host_duration: 12, + self_device_duration: 230 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 33, + device_duration: 230, + self_host_duration: 7, + self_device_duration: 0 + }, + { + name: 'aten::max_pool2d_with_indices_backward', + calls: 1, + host_duration: 73, + device_duration: 1513, + self_host_duration: 30, + self_device_duration: 1283 + }, + { + name: 'MaxPool2DWithIndicesBackward0', + calls: 1, + host_duration: 83, + device_duration: 1513, + self_host_duration: 10, + self_device_duration: 0 + }, + { + name: + 'autograd::engine::evaluate_function: MaxPool2DWithIndicesBackward0', + calls: 1, + host_duration: 106, + device_duration: 1513, + self_host_duration: 23, + self_device_duration: 0 + } + ] + }, + path: '0-21' + }, + { + left: { + name: 'aten::zeros', + duration: 160, + device_duration: 0, + total_duration: 160, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 87, + device_duration: 0, + self_host_duration: 87, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 4, + device_duration: 0, + self_host_duration: 4, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 160, + device_duration: 0, + self_host_duration: 69, + self_device_duration: 0 + } + ] + }, + right: { + name: 'aten::zeros', + duration: 119, + device_duration: 0, + total_duration: 119, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 105, + device_duration: 0, + self_host_duration: 105, + self_device_duration: 0 + }, + { + name: 'aten::zero_', + calls: 1, + host_duration: 2, + device_duration: 0, + self_host_duration: 2, + self_device_duration: 0 + }, + { + name: 'aten::zeros', + calls: 1, + host_duration: 119, + device_duration: 0, + self_host_duration: 12, + self_device_duration: 0 + } + ] + }, + path: '0-22' + }, + { + left: { + name: 'Optimizer.step#SGD.step', + duration: 75435, + device_duration: 1295, + total_duration: 1295, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 40, + device_duration: 0, + self_host_duration: 40, + self_device_duration: 0 + }, + { + name: 
'aten::mul_', + calls: 161, + host_duration: 11945, + device_duration: 401, + self_host_duration: 9568, + self_device_duration: 401 + }, + { + name: 'aten::add_', + calls: 322, + host_duration: 22480, + device_duration: 894, + self_host_duration: 17805, + self_device_duration: 894 + } + ] + }, + right: { + name: 'Optimizer.step#SGD.step', + duration: 16687, + device_duration: 1298, + total_duration: 1298, + aggs: [ + { + name: 'aten::empty', + calls: 1, + host_duration: 8, + device_duration: 0, + self_host_duration: 8, + self_device_duration: 0 + }, + { + name: 'aten::mul_', + calls: 161, + host_duration: 3440, + device_duration: 404, + self_host_duration: 1824, + self_device_duration: 404 + }, + { + name: 'aten::add_', + calls: 322, + host_duration: 6161, + device_duration: 894, + self_host_duration: 3186, + self_device_duration: 894 + } + ] + }, + path: '0-23' + } + ] + }) + } +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/api/openapi.yaml b/tb_plugins/profiling/tb_plugin/fe/src/api/openapi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0218adb9d74ed1505d6b86b5ac6550ec33539144 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/api/openapi.yaml @@ -0,0 +1,1204 @@ +openapi: 3.0.1 +info: + title: Pytorch profile API + version: 1.0.0 +servers: + - url: . +paths: + /runs: + get: + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/Runs' + /views: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + responses: + '200': + description: successful views + content: + '*/*': + schema: + type: array + items: + type: string + enum: + - Overview + - Operator + - Kernel + - Trace + - Distributed + - Memory + /workers: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: view + required: true + schema: + type: string + responses: + '200': + description: successful workers + content: + '*/*': + schema: + type: array + items: + type: string + /spans: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + responses: + '200': + description: successful spans + content: + '*/*': + schema: + type: array + items: + type: string + /overview: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/Overview' + /operation: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + - in: query + name: group_by + required: true + schema: + type: string + enum: + - Operation + - OperationAndInputShape + description: Group By + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/OperatorGraph' + /operation/table: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + - in: query + name: group_by + required: true + schema: + 
type: string + enum: + - Operation + - OperationAndInputShape + description: Group By + responses: + '200': + description: successful operation + content: + '*/*': + schema: + type: object + required: + - metadata + - data + properties: + metadata: + $ref: '#/components/schemas/TableMetadata' + data: + $ref: '#/components/schemas/OperationTableData' + /operation/stack: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + - in: query + name: group_by + required: true + schema: + type: string + enum: + - Operation + - OperationAndInputShape + description: Group By + - in: query + name: op_name + required: true + schema: + type: string + - in: query + name: input_shape + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + type: object + required: + - metadata + - data + properties: + metadata: + $ref: '#/components/schemas/TableMetadata' + data: + $ref: '#/components/schemas/CallStackTableData' + /distributed/overlap: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/DistributedGraph' + /distributed/waittime: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/DistributedGraph' + /distributed/commops: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + type: object + required: + - metadata + - data + properties: + metadata: + type: object + required: + - title + properties: + title: + type: string + data: + type: object + /distributed/gpuinfo: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/GpuInfo' + /memory: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + - in: query + name: start_ts + required: false + schema: + type: number + - in: query + name: end_ts + required: false + schema: + type: number + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/MemoryStatsData' + /memory_curve: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: 
true + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/MemoryCurveData' + /memory_events: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + - in: query + name: start_ts + required: false + schema: + type: number + - in: query + name: end_ts + required: false + schema: + type: number + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/MemoryEventsData' + /kernel: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + - in: query + name: group_by + required: true + schema: + type: string + enum: + - Kernel + - KernelNameAndOpName + description: Group By + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/KernelGraph' + /kernel/table: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + - in: query + name: group_by + required: false + schema: + type: string + enum: + - Kernel + - KernelNameAndOpName + description: Group By + responses: + '200': + description: successful kernel + content: + '*/*': + schema: + $ref: '#/components/schemas/TableData' + /kernel/tc_pie: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/TensorCoresGraph' + /trace: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + responses: + '200': + description: successful trace data + content: + '*/*': + schema: + type: object + /module: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/ModuleViewData' + /tree: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/OperatorNode' + /diffnode: + get: + parameters: + - in: query + name: run + required: true + schema: + type: string + - in: query + name: worker + required: true + schema: + type: string + - in: query + name: span + required: true + schema: + type: string + - in: query + name: exp_run + required: true + schema: + type: string + - in: query + name: exp_worker + required: true + 
schema: + type: string + - in: query + name: exp_span + required: true + schema: + type: string + - in: query + name: path + required: false + schema: + type: string + responses: + '200': + description: successful operation + content: + '*/*': + schema: + $ref: '#/components/schemas/DiffNode' +components: + schemas: + Runs: + type: object + required: + - runs + - loading + properties: + runs: + type: array + items: + type: string + loading: + type: boolean + Performance: + type: object + required: + - name + properties: + name: + type: string + description: + type: string + value: + type: string + extra: + type: string + children: + type: array + items: + $ref: '#/components/schemas/Performance' + Environment: + type: object + required: + - title + - value + properties: + title: + type: string + value: + type: string + GraphColumn: + type: object + required: + - type + - name + properties: + type: + type: string + name: + type: string + role: + type: string + p: + type: object + properties: + html: + type: boolean + ValueAndFormat: + type: object + required: + - v + - f + properties: + v: + oneOf: + - type: string + - type: number + - type: boolean + f: + type: string + Graph: + type: object + required: + - columns + - rows + properties: + title: + type: string + columns: + type: array + items: + $ref: '#/components/schemas/GraphColumn' + rows: + type: array + items: + type: array + items: + oneOf: + - type: string + - type: number + - type: boolean + - $ref: '#/components/schemas/ValueAndFormat' + Overview: + type: object + required: + - performance + - environments + - steps + - recommendations + properties: + performance: + type: array + items: + $ref: '#/components/schemas/Performance' + environments: + type: array + items: + $ref: '#/components/schemas/Environment' + steps: + $ref: '#/components/schemas/Graph' + recommendations: + type: string + gpu_metrics: + $ref: '#/components/schemas/GpuMetrics' + OperatorGraph: + type: object + required: + - device_total_time + - device_self_time + - host_total_time + - host_self_time + properties: + device_total_time: + $ref: '#/components/schemas/Graph' + device_self_time: + $ref: '#/components/schemas/Graph' + host_total_time: + $ref: '#/components/schemas/Graph' + host_self_time: + $ref: '#/components/schemas/Graph' + TableMetadata: + type: object + required: + - sort + properties: + sort: + type: string + tooltips: + type: object + TableData: + type: object + required: + - metadata + - data + properties: + data: + $ref: '#/components/schemas/Graph' + metadata: + $ref: '#/components/schemas/TableMetadata' + KernelGraph: + type: object + required: + - total + properties: + total: + $ref: '#/components/schemas/Graph' + TensorCoresGraph: + type: object + required: + - total + properties: + total: + $ref: '#/components/schemas/Graph' + OperationTableData: + type: array + items: + type: object + required: + - name + - calls + - host_self_duration + - host_total_duration + - has_call_stack + properties: + name: + type: string + input_shape: + type: string + calls: + type: number + device_self_duration: + type: number + device_total_duration: + type: number + host_self_duration: + type: number + host_total_duration: + type: number + has_call_stack: + type: boolean + tc_eligible: + type: string + tc_self_ratio: + type: number + tc_total_ratio: + type: number + CallStackTableData: + type: array + items: + type: object + required: + - name + - calls + - host_self_duration + - host_total_duration + properties: + name: + type: string + input_shape: + 
type: string + calls: + type: number + device_self_duration: + type: number + device_total_duration: + type: number + host_self_duration: + type: number + host_total_duration: + type: number + call_stack: + type: string + tc_eligible: + type: string + tc_self_ratio: + type: number + tc_total_ratio: + type: number + DistributedGraph: + type: object + required: + - metadata + - data + properties: + metadata: + type: object + required: + - title + - legends + - units + properties: + title: + type: string + legends: + type: array + items: + type: string + units: + type: string + data: + type: object + GpuInfo: + type: object + required: + - metadata + - data + properties: + metadata: + type: object + required: + - title + properties: + title: + type: string + data: + type: object + GpuMetrics: + type: object + required: + - data + - tooltip + properties: + data: + type: array + items: + $ref: '#/components/schemas/GpuMetric' + tooltip: + type: string + GpuMetric: + type: object + required: + - title + - value + properties: + title: + type: string + value: + type: string + MemoryStatsData: + type: object + required: + - metadata + - columns + - rows + properties: + metadata: + $ref: '#/components/schemas/MemoryStatsTableMetadata' + columns: + type: array + items: + $ref: '#/components/schemas/GraphColumn' + rows: + type: object + MemoryEventsData: + type: object + required: + - metadata + - columns + - rows + properties: + metadata: + $ref: '#/components/schemas/MemoryEventsTableMetadata' + columns: + type: array + items: + $ref: '#/components/schemas/GraphColumn' + rows: + type: object + MemoryEventsTableMetadata: + type: object + required: + - title + - default_device + - value + properties: + title: + type: string + default_device: + type: string + search: + type: string + sort: + type: string + MemoryStatsTableMetadata: + type: object + required: + - title + - default_device + - search + - sort + - value + properties: + title: + type: string + default_device: + type: string + search: + type: string + sort: + type: string + MemoryCurveDataMetadata: + type: object + required: + - default_device + - devices + - peaks + - totals + - first_ts + - time_metric + - memory_metric + - time_factor + - memory_factor + properties: + default_device: + type: string + devices: + type: array + items: + type: string + peaks: + type: object + totals: + type: object + first_ts: + type: number + time_metric: + type: string + memory_metric: + type: string + time_factor: + type: number + memory_factor: + type: number + MemoryCurveData: + type: object + required: + - metadata + - columns + - rows + properties: + metadata: + $ref: '#/components/schemas/MemoryCurveDataMetadata' + columns: + type: array + items: + $ref: '#/components/schemas/GraphColumn' + rows: + type: object + KeyedColumn: + type: object + required: + - type + - name + - key + properties: + type: + type: string + name: + type: string + key: + type: string + ModuleViewData: + type: object + required: + - columns + - data + properties: + columns: + type: array + items: + $ref: '#/components/schemas/KeyedColumn' + data: + type: array + items: + $ref: '#/components/schemas/ModuleStats' + ModuleStats: + type: object + required: + - name + - id + - occurences + - operators + - host_duration + - self_host_duration + - device_duration + - self_device_duration + - avg_duration + - children + properties: + name: + type: string + id: + type: string + occurences: + type: number + operators: + type: number + host_duration: + type: number + self_host_duration: + 
type: number + device_duration: + type: number + self_device_duration: + type: number + avg_duration: + type: number + children: + type: array + items: + $ref: '#/components/schemas/ModuleStats' + OperatorNode: + type: object + required: + - name + - start_time + - end_time + - type + - tid + - children + properties: + name: + type: string + start_time: + type: number + end_time: + type: number + type: + type: string + tid: + type: number + children: + type: array + items: + $ref: '#/components/schemas/OperatorNode' + OpAgg: + type: object + required: + - name + - calls + - host_duration + - device_duration + - self_host_duration + - self_device_duration + properties: + name: + type: string + calls: + type: number + host_duration: + type: number + device_duration: + type: number + self_host_duration: + type: number + self_device_duration: + type: number + OpStats: + type: object + required: + - name + - duration + - device_duration + - total_duration + - aggs + properties: + name: + type: string + duration: + type: number + device_duration: + type: number + total_duration: + type: number + aggs: + type: array + items: + $ref: '#/components/schemas/OpAgg' + DiffNode: + type: object + required: + - left + - right + - children + - path + properties: + left: + $ref: '#/components/schemas/OpStats' + right: + $ref: '#/components/schemas/OpStats' + path: + type: string + children: + type: array + items: + $ref: '#/components/schemas/DiffNode' diff --git a/tb_plugins/profiling/tb_plugin/fe/src/app.tsx b/tb_plugins/profiling/tb_plugin/fe/src/app.tsx new file mode 100644 index 0000000000000000000000000000000000000000..012e39dbcc18592950bb52607c108e272dbdcc77 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/app.tsx @@ -0,0 +1,605 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. 
+ *--------------------------------------------------------------------------------------------*/ + +import Box from '@material-ui/core/Box' +import Card from '@material-ui/core/Card' +import CardContent from '@material-ui/core/CardContent' +import CardHeader from '@material-ui/core/CardHeader' +import ClickAwayListener from '@material-ui/core/ClickAwayListener' +import CssBaseline from '@material-ui/core/CssBaseline' +import Divider from '@material-ui/core/Divider' +import Drawer from '@material-ui/core/Drawer' +import Fab from '@material-ui/core/Fab' +import FormControl from '@material-ui/core/FormControl' +import IconButton from '@material-ui/core/IconButton' +import ListSubheader from '@material-ui/core/ListSubheader' +import MenuItem from '@material-ui/core/MenuItem' +import Select, { SelectProps } from '@material-ui/core/Select' +import { makeStyles } from '@material-ui/core/styles' +import Tab from '@material-ui/core/Tab' +import Tabs from '@material-ui/core/Tabs' +import Typography from '@material-ui/core/Typography' +import ChevronLeftIcon from '@material-ui/icons/ChevronLeft' +import ChevronRightIcon from '@material-ui/icons/ChevronRight' +import 'antd/es/button/style/css' +import 'antd/es/list/style/css' +import 'antd/es/table/style/css' +import clsx from 'clsx' +import * as React from 'react' +import * as api from './api' +import { DiffOverview } from './components/DiffOverview' +import { DistributedView } from './components/DistributedView' +import { FullCircularProgress } from './components/FullCircularProgress' +import { Kernel } from './components/Kernel' +import { MemoryView } from './components/MemoryView' +import { ModuleView } from './components/ModuleView' +import { Operator } from './components/Operator' +import { Overview } from './components/Overview' +import { TraceView } from './components/TraceView' +import { setup } from './setup' +import './styles.css' +import { firstOrUndefined, sleep } from './utils' + +export enum Views { + Overview = 'Overview', + Operator = 'Operator', + Kernel = 'Kernel', + Trace = 'Trace', + Distributed = 'Distributed', + Memory = 'Memory', + Module = 'Module', + Lightning = 'Lightning' +} + +const ViewNames = { + [Views.Overview]: Views.Overview, + [Views.Operator]: Views.Operator, + [Views.Kernel]: 'GPU Kernel', + [Views.Trace]: Views.Trace, + [Views.Distributed]: Views.Distributed, + [Views.Memory]: Views.Memory, + [Views.Module]: Views.Module, + [Views.Lightning]: Views.Lightning +} + +const drawerWidth = 340 +const useStyles = makeStyles((theme) => ({ + root: { + display: 'flex' + }, + appBar: { + zIndex: theme.zIndex.drawer + 1, + transition: theme.transitions.create(['width', 'margin'], { + easing: theme.transitions.easing.sharp, + duration: theme.transitions.duration.leavingScreen + }) + }, + appBarShift: { + marginLeft: drawerWidth, + width: `calc(100% - ${drawerWidth}px)`, + transition: theme.transitions.create(['width', 'margin'], { + easing: theme.transitions.easing.sharp, + duration: theme.transitions.duration.enteringScreen + }) + }, + menuButton: { + marginRight: 36 + }, + hide: { + display: 'none' + }, + drawer: { + width: drawerWidth, + flexShrink: 0, + whiteSpace: 'nowrap' + }, + drawerOpen: { + width: drawerWidth, + transition: theme.transitions.create('width', { + easing: theme.transitions.easing.sharp, + duration: theme.transitions.duration.enteringScreen + }) + }, + drawerClose: { + transition: theme.transitions.create('width', { + easing: theme.transitions.easing.sharp, + duration: 
theme.transitions.duration.leavingScreen
+    }),
+    overflowX: 'hidden',
+    width: 0,
+    [theme.breakpoints.up('sm')]: {
+      width: 0
+    }
+  },
+  toolbar: {
+    display: 'flex',
+    alignItems: 'center',
+    justifyContent: 'flex-end',
+    padding: theme.spacing(0, 1),
+    // necessary for content to be below app bar
+    ...theme.mixins.toolbar
+  },
+  content: {
+    flexGrow: 1,
+    padding: theme.spacing(3)
+  },
+  formControl: {
+    margin: theme.spacing(1),
+    minWidth: 120
+  },
+  fab: {
+    marginLeft: theme.spacing(1),
+    marginTop: theme.spacing(1),
+    position: 'absolute'
+  },
+  iconButton: {
+    padding: '8px'
+  }
+}))
+
+export const App = () => {
+  const classes = useStyles()
+
+  // #region - State
+
+  const [selectedTab, setSelectedTab] = React.useState(0)
+
+  const [run, setRun] = React.useState('')
+  const [runs, setRuns] = React.useState<string[]>([])
+  const [runsLoading, setRunsLoading] = React.useState(true)
+
+  const [workers, setWorkers] = React.useState<string[]>([])
+  const [worker, setWorker] = React.useState('')
+
+  const [spans, setSpans] = React.useState<string[]>([])
+  const [span, setSpan] = React.useState('')
+
+  const [views, setViews] = React.useState<Views[]>([])
+  const [view, setView] = React.useState('')
+  const [loaded, setLoaded] = React.useState(false)
+  const iframeRef = React.useRef<HTMLIFrameElement>(null)
+
+  const [diffLeftWorkerOptions, setDiffLeftWorkerOptions] = React.useState<
+    string[]
+  >([])
+  const [diffLeftSpansOptions, setDiffLeftSpansOptions] = React.useState<
+    string[]
+  >([])
+  const [diffLeftRun, setDiffLeftRun] = React.useState('')
+  const [diffLeftWorker, setDiffLeftWorker] = React.useState('')
+  const [diffLeftSpan, setDiffLeftSpan] = React.useState('')
+
+  const [diffRightWorkerOptions, setDiffRightWorkerOptions] = React.useState<
+    string[]
+  >([])
+  const [diffRightSpansOptions, setDiffRightSpansOptions] = React.useState<
+    string[]
+  >([])
+  const [diffRightRun, setDiffRightRun] = React.useState('')
+  const [diffRightWorker, setDiffRightWorker] = React.useState('')
+  const [diffRightSpan, setDiffRightSpan] = React.useState('')
+
+  const [open, setOpen] = React.useState(true)
+
+  // #endregion
+
+  React.useEffect(() => {
+    setup().then(() => {
+      setLoaded(true)
+    })
+  }, [])
+
+  const continuouslyFetchRuns = async () => {
+    while (true) {
+      try {
+        const runs = await api.defaultApi.runsGet()
+        setRuns(runs.runs)
+        setRunsLoading(runs.loading)
+      } catch (e) {
+        console.info('Cannot fetch runs: ', e)
+      }
+      await sleep(5000)
+    }
+  }
+
+  React.useEffect(() => {
+    continuouslyFetchRuns()
+  }, [])
+
+  React.useEffect(() => {
+    if (!run || !runs.includes(run)) {
+      setRun(firstOrUndefined(runs) ??
'') + } + }, [runs]) + + // #region - Diff Left + + React.useEffect(() => { + if (diffLeftRun) { + api.defaultApi.workersGet(diffLeftRun, Views.Overview).then((workers) => { + setDiffLeftWorkerOptions(workers) + }) + } + }, [diffLeftRun]) + + React.useEffect(() => { + if (diffLeftRun && diffLeftWorker) { + api.defaultApi.spansGet(diffLeftRun, diffLeftWorker).then((spans) => { + setDiffLeftSpansOptions(spans) + }) + } + }, [diffLeftRun, diffLeftWorker]) + + // #endregion + + // #region - Diff Right + + React.useEffect(() => { + if (diffRightRun) { + api.defaultApi + .workersGet(diffRightRun, Views.Overview) + .then((workers) => { + setDiffRightWorkerOptions(workers) + }) + } + }, [diffRightRun]) + + React.useEffect(() => { + if (diffRightRun && diffRightWorker) { + api.defaultApi.spansGet(diffRightRun, diffRightWorker).then((spans) => { + setDiffRightSpansOptions(spans) + }) + } + }, [diffRightRun, diffRightWorker]) + + // #endregion + + // #region - normal + + React.useEffect(() => { + if (run) { + api.defaultApi.viewsGet(run).then((rawViews) => { + const views = rawViews + .map((v) => Views[Views[v as Views]]) + .filter(Boolean) + setViews(views) + }) + } + }, [run]) + + React.useEffect(() => { + setView(firstOrUndefined(views) ?? '') + }, [views]) + + React.useEffect(() => { + if (run && view) { + api.defaultApi.workersGet(run, view).then((workers) => { + setWorkers(workers) + }) + } + }, [run, view]) + + React.useEffect(() => { + setWorker(firstOrUndefined(workers) ?? '') + }, [workers]) + + React.useEffect(() => { + if (run && worker) { + api.defaultApi.spansGet(run, worker).then((spans) => { + setSpans(spans) + }) + } + }, [run, worker]) + + React.useEffect(() => { + setSpan(firstOrUndefined(spans) ?? '') + }, [spans]) + + // #endregion + + // #region - Event Handler + const handleTabChange = (event: React.ChangeEvent<{}>, value: any) => { + setSelectedTab(value as number) + } + + const handleRunChange: SelectProps['onChange'] = (event) => { + setRun(event.target.value as string) + setView('') + setWorker('') + setSpan('') + } + + const handleViewChange: SelectProps['onChange'] = (event) => { + setView(event.target.value as Views) + setWorker('') + setSpan('') + } + + const handleWorkerChange: SelectProps['onChange'] = (event) => { + setWorker(event.target.value as string) + setSpan('') + } + + const handleSpanChange: SelectProps['onChange'] = (event) => { + setSpan(event.target.value as string) + } + + const handleDiffLeftRunChange: SelectProps['onChange'] = (event) => { + setDiffLeftRun(event.target.value as string) + setDiffLeftWorker('') + setDiffLeftSpan('') + } + + const handleDiffLeftWorkerChange: SelectProps['onChange'] = (event) => { + setDiffLeftWorker(event.target.value as string) + setDiffLeftSpan('') + } + + const handleDiffLeftSpanChange: SelectProps['onChange'] = (event) => { + setDiffLeftSpan(event.target.value as string) + } + + const handleDiffRightRunChange: SelectProps['onChange'] = (event) => { + setDiffRightRun(event.target.value as string) + setDiffRightWorker('') + setDiffRightSpan('') + } + + const handleDiffRightWorkerChange: SelectProps['onChange'] = (event) => { + setDiffRightWorker(event.target.value as string) + setDiffRightSpan('') + } + + const handleDiffRightSpanChange: SelectProps['onChange'] = (event) => { + setDiffRightSpan(event.target.value as string) + } + + const handleDrawerOpen = () => { + setOpen(true) + SetIframeActive() + } + + const handleDrawerClose = () => { + setOpen(false) + SetIframeActive() + } + + const SetIframeActive = () => { + 
iframeRef.current?.focus()
+  }
+
+  // #endregion
+
+  const renderContent = () => {
+    if (!runsLoading && runs.length == 0) {
+      return (
+        <Card>
+          <CardContent>
+            <Typography>There are no runs in the log folder.</Typography>
+          </CardContent>
+        </Card>
+      )
+    }
+
+    if (!loaded || !run || !worker || !view || !span) {
+      return <FullCircularProgress />
+    }
+
+    if (selectedTab === 0) {
+      switch (view) {
+        case Views.Overview:
+          return <Overview run={run} worker={worker} span={span} />
+        case Views.Operator:
+          return <Operator run={run} worker={worker} span={span} />
+        case Views.Kernel:
+          return <Kernel run={run} worker={worker} span={span} />
+        case Views.Trace:
+          return (
+            <TraceView
+              run={run}
+              worker={worker}
+              span={span}
+              iframeRef={iframeRef}
+            />
+          )
+        case Views.Distributed:
+          return <DistributedView run={run} worker={worker} span={span} />
+        case Views.Memory:
+          return <MemoryView run={run} worker={worker} span={span} />
+        case Views.Module:
+        case Views.Lightning:
+          return <ModuleView run={run} worker={worker} span={span} />
+      }
+    } else {
+      return (
+        <DiffOverview
+          run={diffLeftRun}
+          worker={diffLeftWorker}
+          span={diffLeftSpan}
+          expRun={diffRightRun}
+          expWorker={diffRightWorker}
+          expSpan={diffRightSpan}
+        />
+      )
+    }
+  }
+
+  const spanComponent = () => {
+    const spanFragment = (
+      <React.Fragment>
+        <ListSubheader>Spans</ListSubheader>
+        <ClickAwayListener onClickAway={SetIframeActive}>
+          <FormControl className={classes.formControl}>
+            <Select value={span} onChange={handleSpanChange}>
+              {spans.map((s) => (
+                <MenuItem value={s}>{s}</MenuItem>
+              ))}
+            </Select>
+          </FormControl>
+        </ClickAwayListener>
+      </React.Fragment>
+    )
+
+    if (!spans || spans.length <= 1) {
+      return <div className={classes.hide}>{spanFragment}</div>
+ } else { + return spanFragment + } + } + + return ( +
+ + +
+ + + +
+ + + + + + + + {selectedTab == 0 ? ( + <> + Runs + + + + + + Views + + + + + + Workers + + + + + + {spanComponent()} + + ) : ( + <> +   Baseline + Runs + + + + Workers + + + + + Spans + + + + + + +   Experimental + Runs + + + + Workers + + + + Spans + + + + + )} +
+ {!open && ( + + + + )} +
{renderContent()}
+
+  )
+}
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/DataLoading.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/DataLoading.tsx
new file mode 100644
index 0000000000000000000000000000000000000000..e2967bdf74196ad74a13f2d2f8b1799911d3b553
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/src/components/DataLoading.tsx
@@ -0,0 +1,19 @@
+/*---------------------------------------------------------------------------------------------
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ *--------------------------------------------------------------------------------------------*/
+
+import * as React from 'react'
+import { FullCircularProgress } from './FullCircularProgress'
+
+interface IProps<T> {
+  value: T | undefined | null
+  children: (t: T) => JSX.Element
+}
+
+export function DataLoading<T>(props: IProps<T>) {
+  if (props.value === undefined || props.value === null) {
+    return <FullCircularProgress />
+  }
+
+  return props.children(props.value)
+}
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/DiffOverview.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/DiffOverview.tsx
new file mode 100644
index 0000000000000000000000000000000000000000..0fc67e71215978674b0afe0477ab982145afded7
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/src/components/DiffOverview.tsx
@@ -0,0 +1,855 @@
+/*---------------------------------------------------------------------------------------------
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ *--------------------------------------------------------------------------------------------*/
+
+import Button from '@material-ui/core/Button'
+import Card from '@material-ui/core/Card'
+import CardContent from '@material-ui/core/CardContent'
+import CardHeader from '@material-ui/core/CardHeader'
+import Grid from '@material-ui/core/Grid'
+import { makeStyles } from '@material-ui/core/styles'
+import Typography from '@material-ui/core/Typography'
+import ChevronLeftIcon from '@material-ui/icons/ChevronLeft'
+import { Select, Table } from 'antd'
+import * as React from 'react'
+import * as api from '../api'
+import { useResizeEventDependency } from '../utils/resize'
+import { FullCircularProgress } from './FullCircularProgress'
+
+const { Option } = Select
+
+const topGraphHeight = 230
+
+const useStyles = makeStyles((theme) => ({
+  root: {
+    flexGrow: 1
+  },
+  pre: {
+    '& ul': {
+      margin: 0,
+      paddingLeft: theme.spacing(3),
+      ...theme.typography.body1
+    },
+    '& li': {},
+    '& a': {
+      color: '#ffa726'
+    },
+    '& a:active': {
+      color: '#ffa726'
+    },
+    '& p': {
+      margin: 0,
+      ...theme.typography.subtitle1,
+      fontWeight: theme.typography.fontWeightBold
+    }
+  },
+  topGraph: {
+    height: topGraphHeight + 40
+  },
+  iconButton: {
+    padding: '8px'
+  }
+}))
+
+export interface DiffColumnChartIProps {
+  rawData: any[]
+  selectCallback: (row: number, column: number) => void
+}
+
+export interface DiffStepChartIProps {
+  rawData: any[]
+}
+
+const DiffColumnChart: React.FC<DiffColumnChartIProps> = (
+  props: DiffColumnChartIProps
+) => {
+  const { rawData, selectCallback } = props
+  const graphRef = React.useRef<HTMLDivElement>(null)
+  const [resizeEventDependency] = useResizeEventDependency()
+
+  React.useLayoutEffect(() => {
+    const element = graphRef.current
+    if (!element) return
+
+    let left_duration_data: number[] = []
+    let left_accumulated_duration_data: number[] = []
+
+    let right_duration_data: number[] = []
+    let right_accumulated_duration_data: number[] = []
+
+    for (let i = 0; i < rawData.length; i++) {
+      let curr = rawData[i]
+      left_duration_data.push(curr[1])
right_duration_data.push(curr[2])
+      left_accumulated_duration_data.push(curr[3])
+      right_accumulated_duration_data.push(curr[4])
+    }
+
+    let left_duration_max = Math.max(...left_duration_data)
+    let right_duration_max = Math.max(...right_duration_data)
+    let duration_max = Math.max(left_duration_max, right_duration_max)
+
+    let left_accumulated_duration_max = Math.max(
+      ...left_accumulated_duration_data
+    )
+    let right_accumulated_duration_max = Math.max(
+      ...right_accumulated_duration_data
+    )
+    let accumulated_max = Math.max(
+      left_accumulated_duration_max,
+      right_accumulated_duration_max
+    )
+
+    var options = {
+      title: 'Execution Comparison',
+      height: 500,
+      seriesType: 'bars',
+      series: {
+        0: { type: 'bars', targetAxisIndex: 0 },
+        1: { type: 'bars', targetAxisIndex: 0 },
+        2: { type: 'line', targetAxisIndex: 1 },
+        3: { type: 'line', targetAxisIndex: 1 }
+      },
+      vAxes: {
+        0: {
+          logScale: false,
+          maxValue: duration_max
+        },
+        1: {
+          logScale: false,
+          maxValue: accumulated_max
+        }
+      }
+    }
+
+    const chart = new google.visualization.ComboChart(element)
+    const data = google.visualization.arrayToDataTable(rawData)
+    chart.draw(data, options)
+
+    google.visualization.events.addListener(chart, 'select', (entry: any) => {
+      var selectedItem = chart.getSelection()[0]
+      if (selectedItem && selectedItem.hasOwnProperty('row')) {
+        selectCallback(selectedItem.row, selectedItem.column)
+      }
+    })
+
+    return () => {
+      chart.clearChart()
+    }
+  }, [rawData, resizeEventDependency])
+
+  return (
+    <div className={classes.root}>
+      <div ref={graphRef}></div>
+    </div>
+  )
+}
+
+const DiffStepChart: React.FC<DiffStepChartIProps> = (
+  props: DiffStepChartIProps
+) => {
+  const { rawData } = props
+  const graphRef = React.useRef<HTMLDivElement>(null)
+  const [resizeEventDependency] = useResizeEventDependency()
+
+  React.useLayoutEffect(() => {
+    const element = graphRef.current
+    if (!element) return
+
+    var options = {
+      title: 'Execution Diff',
+      height: 500
+    }
+
+    const chart = new google.visualization.SteppedAreaChart(element)
+    const data = google.visualization.arrayToDataTable(rawData)
+    chart.draw(data, options)
+
+    return () => {
+      chart.clearChart()
+    }
+  }, [rawData, resizeEventDependency])
+
+  return (
+    <div ref={graphRef}></div>
+ ) +} + +export interface IProps { + run: string + worker: string + span: string + expRun: string + expWorker: string + expSpan: string +} + +export interface ColumnUnderlyingData { + name: string + path: string + leftAggs: any[] + rightAggs: any[] +} + +export interface TableRow { + key: number + + operator: string + baselineCalls?: number + expCalls?: number + deltaCalls?: number + deltaCallsPercentNumber?: number + deltaCallsPercent?: string + + baselineHostDuration: number + expHostDuration: number + deltaHostDuration: number + deltaHostDurationPercentNumber: number + deltaHostDurationPercent: string + + baselineSelfHostDuration: number + expSelfHostDuration: number + deltaSelfHostDuration: number + deltaSelfHostDurationPercentNumber: number + deltaSelfHostDurationPercent: string + + baselineDeviceDuration: number + expDeviceDuration: number + deltaDeviceDuration: number + deltaDeviceDurationPercentNumber: number + deltaDeviceDurationPercent: string + + baselineSelfDeviceDuration: number + expSelfDeviceDuration: number + deltaSelfDeviceDuration: number + deltaSelfDeviceDurationPercentNumber: number + deltaSelfDeviceDurationPercent: string +} + +let columnChartDataStack: any[][] = [] +let stepChartDataStack: any[][] = [] +let columnUnderlyingDataStack: ColumnUnderlyingData[][] = [] +let columnTableDataSourceStack: TableRow[][] = [] + +export const DiffOverview: React.FC = (props: IProps) => { + // #region - Constant + + const COMPOSITE_NODES_NAME = 'CompositeNodes' + + const hostDurationColumns = [ + { + title: 'Baseline Host Duration (us)', + dataIndex: 'baselineHostDuration', + key: 'baselineHostDuration', + sorter: (a: TableRow, b: TableRow) => + a.baselineHostDuration - b.baselineHostDuration + }, + { + title: 'Exp Host Duration (us)', + dataIndex: 'expHostDuration', + key: 'expHostDuration', + sorter: (a: TableRow, b: TableRow) => + a.expHostDuration - b.expHostDuration + }, + { + title: 'Delta Host Duration (us)', + dataIndex: 'deltaHostDuration', + key: 'deltaHostDuration', + sorter: (a: TableRow, b: TableRow) => + a.deltaHostDuration! - b.deltaHostDuration! + }, + { + title: 'Delta Host Duration%', + dataIndex: 'deltaHostDurationPercent', + key: 'deltaHostDurationPercent', + sorter: (a: TableRow, b: TableRow) => + a.deltaHostDurationPercentNumber! - b.deltaHostDurationPercentNumber! + } + ] + + const selfHostDurationColumns = [ + { + title: 'Baseline Self Host Duration (us)', + dataIndex: 'baselineSelfHostDuration', + key: 'baselineSelfHostDuration', + sorter: (a: TableRow, b: TableRow) => + a.baselineSelfHostDuration - b.baselineSelfHostDuration + }, + { + title: 'Exp Self Host Duration (us)', + dataIndex: 'expSelfHostDuration', + key: 'expSelfHostDuration', + sorter: (a: TableRow, b: TableRow) => + a.expSelfHostDuration - b.expSelfHostDuration + }, + { + title: 'Delta Self Host Duration (us)', + dataIndex: 'deltaSelfHostDuration', + key: 'deltaSelfHostDuration', + sorter: (a: TableRow, b: TableRow) => + a.deltaSelfHostDuration! - b.deltaSelfHostDuration! + }, + { + title: 'Delta Self Host Duration%', + dataIndex: 'deltaSelfHostDurationPercent', + key: 'deltaSelfHostDurationPercent', + sorter: (a: TableRow, b: TableRow) => + a.deltaSelfHostDurationPercentNumber! - + b.deltaSelfHostDurationPercentNumber! 
+ } + ] + + const deviceDurationColumns = [ + { + title: 'Baseline Device Duration (us)', + dataIndex: 'baselineDeviceDuration', + key: 'baselineDeviceDuration', + sorter: (a: TableRow, b: TableRow) => + a.baselineDeviceDuration - b.baselineDeviceDuration + }, + { + title: 'Exp Device Duration (us)', + dataIndex: 'expDeviceDuration', + key: 'expDeviceDuration', + sorter: (a: TableRow, b: TableRow) => + a.expDeviceDuration - b.expDeviceDuration + }, + { + title: 'Delta Device Duration (us)', + dataIndex: 'deltaDeviceDuration', + key: 'deltaDeviceDuration', + sorter: (a: TableRow, b: TableRow) => + a.deltaDeviceDuration! - b.deltaDeviceDuration! + }, + { + title: 'Delta Device Duration%', + dataIndex: 'deltaDeviceDurationPercent', + key: 'deltaDeviceDurationPercent', + sorter: (a: TableRow, b: TableRow) => + a.deltaDeviceDurationPercentNumber! - + b.deltaDeviceDurationPercentNumber! + } + ] + + const selfDeviceDurationColumns = [ + { + title: 'Baseline Self Device Duration (us)', + dataIndex: 'baselineSelfDeviceDuration', + key: 'baselineSelfDeviceDuration', + sorter: (a: TableRow, b: TableRow) => + a.baselineSelfDeviceDuration - b.baselineSelfDeviceDuration + }, + { + title: 'Exp Self Device Duration (us)', + dataIndex: 'expSelfDeviceDuration', + key: 'expSelfDeviceDuration', + sorter: (a: TableRow, b: TableRow) => + a.expSelfDeviceDuration - b.expSelfDeviceDuration + }, + { + title: 'Delta Self Device Duration (us)', + dataIndex: 'deltaSelfDeviceDuration', + key: 'deltaSelfDeviceDuration', + sorter: (a: TableRow, b: TableRow) => + a.deltaSelfDeviceDuration! - b.deltaSelfDeviceDuration! + }, + { + title: 'Delta Self Device Duration%', + dataIndex: 'deltaSelfDeviceDurationPercent', + key: 'deltaSelfDeviceDurationPercent', + sorter: (a: TableRow, b: TableRow) => + a.deltaSelfDeviceDurationPercentNumber! - + b.deltaSelfDeviceDurationPercentNumber! + } + ] + + type IColumnMapType = { [key: string]: any } + + const tableSourceColumnMap: IColumnMapType = { + selfHostDuration: selfHostDurationColumns, + hostDuration: hostDurationColumns, + deviceDuration: deviceDurationColumns, + selfDeviceDuration: selfDeviceDurationColumns + } + + const baseTableColumns = [ + { + title: 'Operator', + dataIndex: 'operator', + key: 'operator', + sorter: (a: TableRow, b: TableRow) => a.operator.localeCompare(b.operator) + }, + { + title: 'Baseline Calls', + dataIndex: 'baselineCalls', + key: 'baselineCalls', + sorter: (a: TableRow, b: TableRow) => a.baselineCalls! - b.baselineCalls! + }, + { + title: 'Exp Calls', + dataIndex: 'expCalls', + key: 'expCalls', + sorter: (a: TableRow, b: TableRow) => a.expCalls! - b.expCalls! + }, + { + title: 'Delta Calls', + dataIndex: 'deltaCalls', + key: 'deltaCalls', + sorter: (a: TableRow, b: TableRow) => a.deltaCalls! - b.deltaCalls! + }, + { + title: 'Delta Calls%', + dataIndex: 'deltaCallsPercent', + key: 'deltaCallsPercent', + sorter: (a: TableRow, b: TableRow) => + a.deltaCallsPercentNumber! - b.deltaCallsPercentNumber! 
+ } + ] + + // #endregion + + // #region - State + const [tableDataSource, setTableDataSource] = React.useState([]) + const { run, worker, span, expRun, expWorker, expSpan } = props + + const [columnUnderlyingData, setColumnUnderlyingData] = React.useState< + ColumnUnderlyingData[] + >([]) + + const [ + rootUnderlyingData, + setRootUnderlyingData + ] = React.useState() + + const [columnChartData, setColumnChartData] = React.useState([]) + const [stepChartData, setStepChartData] = React.useState([]) + + const [ + selectedTableColumnsOptions, + setSelectedTableColumnsOptions + ] = React.useState<[key: string]>(['hostDuration']) + const [selectedTableColumns, setSelectedTableColumns] = React.useState( + [...baseTableColumns, ...hostDurationColumns] + ) + + const [dataStackLevel, setDataStackLevel] = React.useState(0) + const [loading, setLoading] = React.useState(false) + + // #endregion + const classes = useStyles() + + // #region - Event Handler + const handleChartColumnSelect = (row: number, column: number) => { + if (columnUnderlyingData.length === 0) { + return + } + + let selectedUnderlyingData = columnUnderlyingData[row] + if (!selectedUnderlyingData) { + return + } + + let tableDataSource = generateDataSourceFromUnderlyingData( + selectedUnderlyingData + ) + setTableDataSource(tableDataSource) + columnTableDataSourceStack.push(tableDataSource) + + setLoading(true) + + api.defaultApi + .diffnodeGet( + run, + worker, + span, + expRun, + expWorker, + expSpan, + selectedUnderlyingData.path + ) + .then((resp) => handleDiffNodeResp(resp)) + .finally(() => setLoading(false)) + } + + const handleGoBack = () => { + if (columnChartDataStack.length > 1) { + columnChartDataStack.pop() + let top = columnChartDataStack[columnChartDataStack.length - 1] + setColumnChartData(top) + } + + if (stepChartDataStack.length > 1) { + stepChartDataStack.pop() + let top = stepChartDataStack[stepChartDataStack.length - 1] + setStepChartData(top) + } + + if (columnUnderlyingDataStack.length > 0) { + columnUnderlyingDataStack.pop() + let top = columnUnderlyingDataStack[columnUnderlyingDataStack.length - 1] + setColumnUnderlyingData(top) + } + + if (columnTableDataSourceStack.length > 0) { + columnTableDataSourceStack.pop() + let top = + columnTableDataSourceStack[columnTableDataSourceStack.length - 1] + + if (top) { + setTableDataSource(top) + } else { + let tableDataSource = generateDataSourceFromUnderlyingData( + rootUnderlyingData! 
+ ) + setTableDataSource(tableDataSource) + } + } + + setDataStackLevel(dataStackLevel - 1) + } + + const toPercentString = (percentNumber: number) => { + if (isNaN(percentNumber)) { + return 'N/A' + } + + return `${percentNumber.toFixed(2)}%` + } + + const handleColumnSelectionChange = (value: [key: string]) => { + let columns = value.map((x) => tableSourceColumnMap[x]).flat() + let r = [...baseTableColumns, ...columns] + setSelectedTableColumnsOptions(value) + setSelectedTableColumns(r) + } + + const generateDataSourceFromUnderlyingData = ( + selectedUnderlyingData: ColumnUnderlyingData + ) => { + let tableDataSource: TableRow[] = [] + + for (let i = 0; i < selectedUnderlyingData.leftAggs.length; i++) { + let left = selectedUnderlyingData.leftAggs[i] + let right = selectedUnderlyingData.rightAggs[i] + + let deltaCallsPercentNumber = + ((right.calls - left.calls) / left.calls) * 100 + + let deltaHostDurationPercentNumber = + ((right.host_duration - left.host_duration) / left.host_duration) * 100 + + let deltaSelfHostDurationPercentNumber = + ((right.self_host_duration - left.self_host_duration) / + left.self_host_duration) * + 100 + + let deltaDeviceDurationPercentNumber = + ((right.device_duration - left.device_duration) / + left.device_duration) * + 100 + + let deltaSelfDeviceDurationPercentNumber = + ((right.self_device_duration - left.self_device_duration) / + left.self_device_duration) * + 100 + + tableDataSource.push({ + key: i, + operator: left.name, + baselineCalls: left.calls, + expCalls: right.calls, + deltaCalls: right.calls - left.calls, + deltaCallsPercentNumber: deltaCallsPercentNumber, + deltaCallsPercent: toPercentString(deltaCallsPercentNumber), + + baselineHostDuration: left.host_duration, + expHostDuration: right.host_duration, + deltaHostDuration: right.host_duration - left.host_duration, + deltaHostDurationPercentNumber: deltaHostDurationPercentNumber, + deltaHostDurationPercent: toPercentString( + deltaHostDurationPercentNumber + ), + + baselineSelfHostDuration: left.self_host_duration, + expSelfHostDuration: right.self_host_duration, + deltaSelfHostDuration: + right.self_host_duration - left.self_host_duration, + deltaSelfHostDurationPercentNumber: deltaSelfHostDurationPercentNumber, + deltaSelfHostDurationPercent: toPercentString( + deltaSelfHostDurationPercentNumber + ), + + baselineDeviceDuration: left.device_duration, + expDeviceDuration: right.device_duration, + deltaDeviceDuration: right.device_duration - left.device_duration, + deltaDeviceDurationPercentNumber: deltaDeviceDurationPercentNumber, + deltaDeviceDurationPercent: toPercentString( + deltaDeviceDurationPercentNumber + ), + + baselineSelfDeviceDuration: left.self_device_duration, + expSelfDeviceDuration: right.self_device_duration, + deltaSelfDeviceDuration: + right.self_device_duration - left.self_device_duration, + deltaSelfDeviceDurationPercentNumber: deltaSelfDeviceDurationPercentNumber, + deltaSelfDeviceDurationPercent: toPercentString( + deltaSelfDeviceDurationPercentNumber + ) + }) + } + + return tableDataSource + } + + React.useEffect(() => { + if ( + run.length > 0 && + worker.length > 0 && + span.length > 0 && + expRun.length > 0 && + expWorker.length > 0 && + expSpan.length > 0 + ) { + setLoading(true) + + columnChartDataStack = [] + stepChartDataStack = [] + columnUnderlyingDataStack = [] + columnTableDataSourceStack = [] + + api.defaultApi + .diffnodeGet(run, worker, span, expRun, expWorker, expSpan) + .then((resp) => { + handleDiffNodeResp(resp) + let rootUnderlyingData = { + name: 
'rootNode', + path: resp.path, + leftAggs: resp.left.aggs, + rightAggs: resp.right.aggs + } + + setRootUnderlyingData(rootUnderlyingData) + let tableDataSource = generateDataSourceFromUnderlyingData( + rootUnderlyingData! + ) + setTableDataSource(tableDataSource) + }) + .finally(() => setLoading(false)) + + setSelectedTableColumns([...baseTableColumns, ...hostDurationColumns]) + } + }, [run, worker, span, expRun, expWorker, expSpan]) + + const handleDiffNodeResp = (resp: any) => { + let columnChartData: any[] = [] + let stepChartData: any[] = [] + let underlyingData: ColumnUnderlyingData[] = [] + + columnChartData.push([ + 'Call', + 'Baseline', + 'Experiment', + 'Baseline Trend', + 'Exp Trend' + ]) + stepChartData.push(['Call', 'Diff', 'Accumulated Diff']) + + if (resp.children.length > 0) { + let accumulated_left_duration = 0 + let accumulated_right_duration = 0 + let accumulated_step_diff = 0 + for (let i = 0; i < resp.children.length; i++) { + let left = resp.children[i].left + let right = resp.children[i].right + let currColumn: any[] = [] + let currStep: any[] = [] + + let name = left.name + if (name === COMPOSITE_NODES_NAME) { + continue + } + + if (name.startsWith('aten::')) { + // Ignore aten operators + continue + } + + if (name.startsWith('enumerate(DataLoader)')) { + name = name.substring(21) + } + + if (name.startsWith('enumerate(DataPipe)')) { + name = name.substring(19) + } + + if (name.startsWith('nn.Module: ')) { + name = name.substring(11) + } + + if (name.startsWith('Optimizer.zero_grad')) { + name = 'Optimizer.zero_grad' + } + + if (name.startsWith('Optimizer.step')) { + name = 'Optimizer.step' + } + + currColumn.push(name) + currColumn.push(left.total_duration) + currColumn.push(right.total_duration) + + accumulated_left_duration += left.total_duration + currColumn.push(accumulated_left_duration) + + accumulated_right_duration += right.total_duration + currColumn.push(accumulated_right_duration) + columnChartData.push(currColumn) + + underlyingData.push({ + name: name, + path: resp.children[i].path, + leftAggs: left.aggs, + rightAggs: right.aggs + }) + + currStep.push(name) + let stepDiff = right.total_duration - left.total_duration + currStep.push(stepDiff) + + accumulated_step_diff += stepDiff + currStep.push(accumulated_step_diff) + + stepChartData.push(currStep) + } + } else { + let left = resp.left + let right = resp.right + let currColumn: any[] = [] + let currStep: any[] = [] + let name = left.name + + if (name.startsWith('nn.Module: ')) { + name = name.substring(11) + } + + currColumn.push(name) + currColumn.push(left.total_duration) + currColumn.push(right.total_duration) + currColumn.push(left.total_duration) + currColumn.push(right.total_duration) + + columnChartData.push(currColumn) + + currStep.push(name) + let stepDiff = right.total_duration - left.total_duration + currStep.push(stepDiff) + currStep.push(stepDiff) + stepChartData.push(currStep) + } + + setColumnChartData(columnChartData) + columnChartDataStack.push(columnChartData) + + setStepChartData(stepChartData) + stepChartDataStack.push(stepChartData) + + setColumnUnderlyingData(underlyingData) + columnUnderlyingDataStack.push(underlyingData) + + setDataStackLevel(columnChartDataStack.length) + } + + // #endregion + + if (!loading && columnUnderlyingDataStack.length === 0) { + return ( + + + + There is no run selected for diff. + + + ) + } + + if (loading) { + return + } + + return ( +
+ + + + + + + + {columnChartData.length > 1 && ( + <> + + + + )} + {columnChartData.length === 1 && ( + No more level to show. + )} + + + + + + + + + + +   + + + + + + + + ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/DistributedView.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/DistributedView.tsx new file mode 100644 index 0000000000000000000000000000000000000000..f742711b4aa3badbc923bb8a222b8ee74c4208aa --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/DistributedView.tsx @@ -0,0 +1,300 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import Card from '@material-ui/core/Card' +import CardContent from '@material-ui/core/CardContent' +import CardHeader from '@material-ui/core/CardHeader' +import Grid from '@material-ui/core/Grid' +import InputLabel from '@material-ui/core/InputLabel' +import MenuItem from '@material-ui/core/MenuItem' +import Select, { SelectProps } from '@material-ui/core/Select' +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' +import * as api from '../api' +import { DistributedGraph, GpuInfo, Graph } from '../api' +import { firstOrUndefined } from '../utils' +import { ColumnChart } from './charts/ColumnChart' +import { TableChart } from './charts/TableChart' +import { DataLoading } from './DataLoading' +import { GpuInfoTable } from './GpuInfoTable' +import { makeChartHeaderRenderer, useTooltipCommonStyles } from './helpers' +import { + DistributedCommopsTableTooltip, + DistributedGpuInfoTableTooltip, + DistributedOverlapGraphTooltip, + DistributedWaittimeGraphTooltip +} from './TooltipDescriptions' + +export interface IProps { + run: string + worker: string + span: string +} + +const useStyles = makeStyles((theme) => ({ + root: { + flexGrow: 1 + }, + verticalInput: { + display: 'flex', + alignItems: 'center' + }, + inputWidth: { + width: '4em' + }, + inputWidthOverflow: { + minWidth: '15em', + whiteSpace: 'nowrap' + }, + description: { + marginLeft: theme.spacing(1) + } +})) + +export const DistributedView: React.FC = (props) => { + const tooltipCommonClasses = useTooltipCommonStyles() + const chartHeaderRenderer = React.useMemo( + () => makeChartHeaderRenderer(tooltipCommonClasses), + [tooltipCommonClasses] + ) + + let { run, worker, span } = props + const classes = useStyles() + + const [overlapGraph, setOverlapGraph] = React.useState< + DistributedGraph | undefined + >(undefined) + const [waittimeGraph, setWaittimeGraph] = React.useState< + DistributedGraph | undefined + >(undefined) + const [commopsTableData, setCommopsTableData] = React.useState< + any | undefined + >(undefined) + const [gpuInfo, setGpuInfo] = React.useState(undefined) + const [commopsTableTitle, setCommopsTableTitle] = React.useState('') + const [commopsWorkers, setCommopsWorkers] = React.useState([]) + const [overlapSteps, setOverlapSteps] = React.useState([]) + const [waittimeSteps, setWaittimeSteps] = React.useState([]) + const [overlapStep, setOverlapStep] = React.useState('') + const [waittimeStep, setWaittimeStep] = React.useState('') + const [commopsWorker, setCommopsWorker] = React.useState('') + + React.useEffect(() => { + if (waittimeSteps.includes('all')) { + setWaittimeStep('all') + } else { + setWaittimeStep(firstOrUndefined(waittimeSteps) ?? 
'') + } + }, [waittimeSteps]) + + React.useEffect(() => { + if (overlapSteps.includes('all')) { + setOverlapStep('all') + } else { + setOverlapStep(firstOrUndefined(overlapSteps) ?? '') + } + }, [overlapSteps]) + + React.useEffect(() => { + setCommopsWorker(firstOrUndefined(commopsWorkers) ?? '') + }, [commopsWorkers]) + + React.useEffect(() => { + api.defaultApi.distributedOverlapGet(run, 'All', span).then((resp) => { + setOverlapGraph(resp) + setOverlapSteps(Object.keys(resp.data)) + }) + api.defaultApi.distributedWaittimeGet(run, 'All', span).then((resp) => { + setWaittimeGraph(resp) + setWaittimeSteps(Object.keys(resp.data)) + }) + api.defaultApi.distributedCommopsGet(run, 'All', span).then((resp) => { + setCommopsTableData(resp.data) + setCommopsWorkers(Object.keys(resp.data)) + setCommopsTableTitle(resp.metadata.title) + }) + api.defaultApi.distributedGpuinfoGet(run, 'All', span).then((resp) => { + setGpuInfo(resp) + }) + }, [run, worker, span]) + + const onCommopsWorkerChanged: SelectProps['onChange'] = (event) => { + setCommopsWorker(event.target.value as string) + } + + const onOverlapStepChanged: SelectProps['onChange'] = (event) => { + setOverlapStep(event.target.value as string) + } + + const onWaittimeStepChanged: SelectProps['onChange'] = (event) => { + setWaittimeStep(event.target.value as string) + } + + const getColumnChartData = ( + distributedGraph?: DistributedGraph, + step?: string + ) => { + if (!distributedGraph || !step) return undefined + const barLabels = Object.keys(distributedGraph.data[step]) + return { + legends: distributedGraph.metadata.legends, + barLabels, + barHeights: barLabels.map((label) => distributedGraph.data[step][label]) + } + } + const overlapData = React.useMemo( + () => getColumnChartData(overlapGraph, overlapStep), + [overlapGraph, overlapStep] + ) + const waittimeData = React.useMemo( + () => getColumnChartData(waittimeGraph, waittimeStep), + [waittimeGraph, waittimeStep] + ) + + const getTableData = (tableData?: any, worker?: string) => { + if (!tableData || !worker) return undefined + return tableData[worker] as Graph + } + const commopsTable = getTableData(commopsTableData, commopsWorker) + + return ( +
+ + + + + {gpuInfo && ( + + + + + + + + + )} + + + {(chartData) => ( + + + + + Step + + + + + + + {overlapGraph?.metadata?.title && ( + + )} + + + )} + + + + + + {(chartData) => ( + + + + + Step + + + + + + + {waittimeGraph?.metadata?.title && ( + + )} + + + )} + + + + + + + + + + Worker + + + + + + + + + + {(graph) => } + + + + + + + +
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/FullCircularProgress.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/FullCircularProgress.tsx new file mode 100644 index 0000000000000000000000000000000000000000..5212bd74bf9739cc171d369e6591a0c26f058f6a --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/FullCircularProgress.tsx @@ -0,0 +1,23 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ +import CircularProgress from '@material-ui/core/CircularProgress' +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' + +const useStyles = makeStyles(() => ({ + root: { + width: '100%', + display: 'flex', + justifyContent: 'center' + } +})) + +export const FullCircularProgress: React.FC = () => { + const classes = useStyles() + return ( +
+    <div className={classes.root}>
+      <CircularProgress />
+    </div>
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/GpuInfoTable.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/GpuInfoTable.tsx new file mode 100644 index 0000000000000000000000000000000000000000..4c624db0580caa466271e56505f2838637705884 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/GpuInfoTable.tsx @@ -0,0 +1,134 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' + +export interface IProps { + gpuInfo: any +} + +const useStyles = makeStyles((theme) => ({ + root: { + border: '1px solid #E0E0E0', + borderCollapse: 'collapse', + width: '100%' + }, + td: { + borderTop: '1px solid #E0E0E0', + borderBottom: '1px solid #E0E0E0', + borderCollapse: 'collapse', + paddingLeft: 10, + paddingRight: 10 + }, + nodeTd: { + fontWeight: 'bold' + }, + pidTd: { + fontWeight: 'normal' + }, + gpuTd: { + fontWeight: 'normal' + }, + keyTd: { + fontWeight: 'normal', + textAlign: 'right' + }, + valueTd: { + fontWeight: 'bold' + } +})) + +interface TableCellInfo { + content: string + rowspan: number + cellType: 'node' | 'pid' | 'gpu' | 'key' | 'value' + last?: boolean +} + +function makeTableCellInfo(gpuInfo: any): TableCellInfo[][] { + const rows: TableCellInfo[][] = [] + let curr_row: TableCellInfo[] = [] + rows.push(curr_row) + Object.keys(gpuInfo.data).forEach(function (node_name) { + const node_cell = { + content: node_name, + rowspan: 0, + cellType: 'node' as const + } + const i = rows.length + curr_row.push(node_cell) + Object.keys(gpuInfo.data[node_name]).forEach(function (pid) { + const pid_cell = { content: pid, rowspan: 0, cellType: 'pid' as const } + const i = rows.length + curr_row.push(pid_cell) + Object.keys(gpuInfo.data[node_name][pid]).forEach(function (gpu) { + const gpu_cell = { content: gpu, rowspan: 0, cellType: 'gpu' as const } + const i = rows.length + curr_row.push(gpu_cell) + Object.keys(gpuInfo.data[node_name][pid][gpu]).forEach(function ( + key_name + ) { + curr_row.push({ + content: key_name, + rowspan: 1, + cellType: 'key' as const + }) + const value: string = gpuInfo.data[node_name][pid][gpu][key_name] + curr_row.push({ + content: value, + rowspan: 1, + cellType: 'value' as const + }) + curr_row = [] + rows.push(curr_row) + }) + gpu_cell.rowspan = rows.length - i + }) + pid_cell.rowspan = rows.length - i + }) + node_cell.rowspan = rows.length - i + }) + rows.pop() + return rows +} + +export const GpuInfoTable: React.FC = (props) => { + const classes = useStyles() + interface TableCellInfo { + content: string + rowspan: number + cellType: 'node' | 'pid' | 'gpu' | 'key' | 'value' + } + + const rows = React.useMemo(() => makeTableCellInfo(props.gpuInfo), [ + props.gpuInfo + ]) + + const cellToClass = { + node: classes.nodeTd, + pid: classes.pidTd, + gpu: classes.gpuTd, + key: classes.keyTd, + value: classes.valueTd + } + + const renderCell = function (info: TableCellInfo) { + let cellClass = cellToClass[info.cellType] + let content = info.cellType == 'key' ? info.content + ':' : info.content + return ( +
+      <td className={classes.td + ' ' + cellClass} rowSpan={info.rowspan}>
+        {content}
+      </td>
+    )
+  }
+
+  return (
+    <table className={classes.root}>
+      {rows.map((row) => (
+        <tr>{row.map(renderCell)}</tr>
+      ))}
+    </table>
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx new file mode 100644 index 0000000000000000000000000000000000000000..0ad17faebcfa710572be7ad6853dc9b309448bad --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx @@ -0,0 +1,282 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import Card from '@material-ui/core/Card' +import CardContent from '@material-ui/core/CardContent' +import CardHeader from '@material-ui/core/CardHeader' +import FormControlLabel from '@material-ui/core/FormControlLabel' +import Grid from '@material-ui/core/Grid' +import InputLabel from '@material-ui/core/InputLabel' +import MenuItem from '@material-ui/core/MenuItem' +import Radio from '@material-ui/core/Radio' +import RadioGroup, { RadioGroupProps } from '@material-ui/core/RadioGroup' +import Select, { SelectProps } from '@material-ui/core/Select' +import { makeStyles } from '@material-ui/core/styles' +import TextField, { + StandardTextFieldProps, + TextFieldProps +} from '@material-ui/core/TextField' +import * as React from 'react' +import * as api from '../api' +import { Graph } from '../api' +import { KernelGroupBy } from '../constants/groupBy' +import { useSearch } from '../utils/search' +import { topIsValid, UseTop, useTopN } from '../utils/top' +import { AntTableChart } from './charts/AntTableChart' +import { PieChart } from './charts/PieChart' +import { DataLoading } from './DataLoading' +import { makeChartHeaderRenderer, useTooltipCommonStyles } from './helpers' +import { + GPUKernelTotalTimeTooltip, + TensorCoresPieChartTooltip +} from './TooltipDescriptions' + +export interface IProps { + run: string + worker: string + span: string +} + +const useStyles = makeStyles((theme) => ({ + root: { + flexGrow: 1 + }, + verticalInput: { + display: 'flex', + alignItems: 'center' + }, + inputWidth: { + width: '4em' + }, + inputWidthOverflow: { + minWidth: '15em', + whiteSpace: 'nowrap' + }, + description: { + marginLeft: theme.spacing(1) + } +})) + +export const Kernel: React.FC = (props) => { + const { run, worker, span } = props + const classes = useStyles() + const tooltipCommonClasses = useTooltipCommonStyles() + const chartHeaderRenderer = React.useMemo( + () => makeChartHeaderRenderer(tooltipCommonClasses), + [tooltipCommonClasses] + ) + + const [kernelGraph, setKernelGraph] = React.useState( + undefined + ) + const [tcGraph, setTcGraph] = React.useState(undefined) + const [kernelTable, setKernelTable] = React.useState( + undefined + ) + const [groupBy, setGroupBy] = React.useState(KernelGroupBy.Kernel) + const [searchKernelName, setSearchKernelName] = React.useState('') + const [searchOpName, setSearchOpName] = React.useState('') + const [sortColumn, setSortColumn] = React.useState('') + + const [topText, actualTop, useTop, setTopText, setUseTop] = useTopN({ + defaultUseTop: UseTop.Use, + defaultTop: 10 + }) + + React.useEffect(() => { + setSearchOpName('') + }, [groupBy]) + + React.useEffect(() => { + if (kernelGraph) { + setTopText(String(Math.min(kernelGraph.rows?.length, 10))) + } + }, [kernelGraph]) + + React.useEffect(() => { + api.defaultApi.kernelTableGet(run, worker, span, groupBy).then((resp) => { + setSortColumn(resp.metadata.sort) + setKernelTable(resp.data) + }) + }, [run, 
worker, span, groupBy]) + + React.useEffect(() => { + api.defaultApi + .kernelGet(run, worker, span, KernelGroupBy.Kernel) + .then((resp) => { + setKernelGraph(resp.total) + }) + }, [run, worker, span]) + + React.useEffect(() => { + api.defaultApi.kernelTcPieGet(run, worker, span).then((resp) => { + setTcGraph(resp.total) + }) + }, [run, worker, span]) + + const [searchedKernelTable] = useSearch(searchKernelName, 'name', kernelTable) + const [searchedOpTable] = useSearch( + searchOpName, + 'operator', + searchedKernelTable + ) + + const onGroupByChanged: SelectProps['onChange'] = (event) => { + setGroupBy(event.target.value as KernelGroupBy) + } + + const onSearchKernelChanged: TextFieldProps['onChange'] = (event) => { + setSearchKernelName(event.target.value as string) + } + + const onSearchOpChanged: TextFieldProps['onChange'] = (event) => { + setSearchOpName(event.target.value as string) + } + + const onUseTopChanged: RadioGroupProps['onChange'] = (event) => { + setUseTop(event.target.value as UseTop) + } + + const onTopChanged = (event: React.ChangeEvent) => { + setTopText(event.target.value) + } + + const inputProps: StandardTextFieldProps['inputProps'] = { + min: 1 + } + + const GPUKernelTotalTimeTitle = React.useMemo( + () => chartHeaderRenderer('Total Time (us)', GPUKernelTotalTimeTooltip), + [chartHeaderRenderer] + ) + + const TensorCoresTitle = React.useMemo( + () => + chartHeaderRenderer( + 'Tensor Cores Utilization', + TensorCoresPieChartTooltip + ), + [chartHeaderRenderer] + ) + + return ( +
+ + + + + + + + } + label="All kernels" + /> + } + label="Top kernels to show" + /> + + + {useTop === UseTop.Use && ( + + + + )} + + + + {(graph) => ( + + + + + )} + + + + + {(graph) => ( + + + + + )} + + + + + + + Group By + + + + + + + + {groupBy === KernelGroupBy.KernelNameAndOpName && ( + + + + )} + + + + + {(graph) => ( + + )} + + + + + + +
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/MemoryView.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/MemoryView.tsx new file mode 100644 index 0000000000000000000000000000000000000000..fb3c90c3bcef77c6e110aa2c3e6b38234f464b7d --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/MemoryView.tsx @@ -0,0 +1,465 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import Card from '@material-ui/core/Card' +import CardContent from '@material-ui/core/CardContent' +import CardHeader from '@material-ui/core/CardHeader' +import Grid from '@material-ui/core/Grid' +import InputLabel from '@material-ui/core/InputLabel' +import MenuItem from '@material-ui/core/MenuItem' +import Select, { SelectProps } from '@material-ui/core/Select' +import Slider from '@material-ui/core/Slider' +import { makeStyles } from '@material-ui/core/styles' +import TextField, { TextFieldProps } from '@material-ui/core/TextField' +import * as React from 'react' +import * as api from '../api' +import { + Graph, + MemoryCurveData, + MemoryEventsData, + MemoryStatsData +} from '../api' +import { useSearchDirectly } from '../utils/search' +import { AntTableChart } from './charts/AntTableChart' +import { LineChart } from './charts/LineChart' +import { DataLoading } from './DataLoading' +import { MemoryStatsTable } from './tables/MemoryStatsTable' + +const useStyles = makeStyles((theme) => ({ + root: { + flexGrow: 1 + }, + curve: { + marginBottom: 20 + }, + verticalInput: { + display: 'flex', + alignItems: 'center' + }, + inputWidth: { + width: '4em' + }, + inputWidthOverflow: { + minWidth: '15em', + whiteSpace: 'nowrap' + }, + full: { + width: '100%' + }, + description: { + marginLeft: theme.spacing(1) + }, + filterSlider: { + marginTop: 15, + marginRight: 6, + width: 250 + }, + filterInput: { + width: 100 + } +})) + +export interface IProps { + run: string + worker: string + span: string +} + +export const MemoryView: React.FC = React.memo((props) => { + interface EventSizeFilter { + [deviceName: string]: Array + } + + interface MaxEventSize { + [deviceName: string]: number + } + + const { run, worker, span } = props + const classes = useStyles() + + const [memoryStatsData, setMemoryStatsData] = React.useState< + MemoryStatsData | undefined + >(undefined) + + // for backward compatability, old profile do not have events to show + const showEvents = () => { + return memoryEventsData && Object.keys(memoryEventsData.rows).length != 0 + } + const [memoryEventsData, setMemoryEventsData] = React.useState< + MemoryEventsData | undefined + >(undefined) + + // for backward compatability, old profile do not have curve to show + const showCurve = () => { + return memoryCurveData && Object.keys(memoryCurveData.rows).length != 0 + } + const [memoryCurveData, setMemoryCurveData] = React.useState< + MemoryCurveData | undefined + >(undefined) + + const [lineChartData, setLineChartData] = React.useState( + undefined + ) + + const [devices, setDevices] = React.useState([]) + const [device, setDevice] = React.useState('') + interface SelectedRange { + start: number + end: number + startTs: number + endTs: number + } + const [selectedRange, setSelectedRange] = React.useState< + SelectedRange | undefined + >() + const [searchOperatorName, setSearchOperatorName] = React.useState('') + const 
[searchEventOperatorName, setSearchEventOperatorName] = React.useState( + '' + ) + const [filterEventSize, setFilterEventSize] = React.useState( + {} + ) + const [maxSize, setMaxSize] = React.useState({}) + + const getSearchIndex = function () { + if (!memoryStatsData) { + return -1 + } + for (let i = 0; i < memoryStatsData.columns.length; i++) { + if (memoryStatsData.columns[i].name == memoryStatsData.metadata.search) { + return i + } + } + return -1 + } + + const getStep = (size: number, indexBias: number) => { + return 10 ** (Math.floor(Math.log10(size != 0 ? size : 1)) - indexBias) + } + + const filterByEventSize = ( + rows: T[] | undefined, + size: Array + ) => { + const result = React.useMemo(() => { + if (!rows) { + return undefined + } + + // workaround type system + const field = (row: any): number => { + const sizeColIndex = 1 + return row[sizeColIndex] + } + + return rows.filter((row) => { + return field(row) >= size[0] && field(row) <= size[1] + }) + }, [rows, size]) + + return result + } + + const searchIndex = getSearchIndex() + const getName = React.useCallback((row: any) => row[searchIndex], [ + searchIndex + ]) + const [searchedTableDataRows] = useSearchDirectly( + searchOperatorName, + getName, + memoryStatsData?.rows[device] ?? [] + ) + const [searchedEventsTableDataRows] = useSearchDirectly( + searchEventOperatorName, + getName, + filterByEventSize( + memoryEventsData?.rows[device], + filterEventSize[device] ?? [0, Infinity] + ) ?? [] + ) + + const onSearchOperatorChanged: TextFieldProps['onChange'] = (event) => { + setSearchOperatorName(event.target.value as string) + } + + const onSearchEventOperatorChanged: TextFieldProps['onChange'] = (event) => { + setSearchEventOperatorName(event.target.value as string) + } + + const [selectedRecord, setSelectedRecord] = React.useState() + const onRowSelected = (record?: object, rowIndex?: number) => { + setSelectedRecord(record) + } + + const onFilterEventSizeChanged = ( + event: any, + newValue: number | number[] + ) => { + setFilterEventSize({ + ...filterEventSize, + [device]: newValue as number[] + }) + } + + const onFilterEventMinSizeInputChanged = ( + event: React.ChangeEvent + ) => { + setFilterEventSize({ + ...filterEventSize, + [device]: [Number(event.target.value), filterEventSize[device][1]] + }) + } + + const onFilterEventMaxSizeInputChanged = ( + event: React.ChangeEvent + ) => { + setFilterEventSize({ + ...filterEventSize, + [device]: [filterEventSize[device][0], Number(event.target.value)] + }) + } + + React.useEffect(() => { + api.defaultApi + .memoryGet( + run, + worker, + span, + selectedRange?.startTs, + selectedRange?.endTs + ) + .then((resp) => { + setMemoryStatsData(resp) + if (!devices || devices.length == 0) { + // setDevices only execute on view load. Since selection on curve + // might filter all events later, some devices might is missing. 
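+          // Capture the device list and default device only on the first
+          // load so the device selector stays stable afterwards.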
+ setDevices(Object.keys(resp.rows)) + setDevice(resp.metadata.default_device) + } + }) + }, [run, worker, span, selectedRange]) + + React.useEffect(() => { + api.defaultApi + .memoryEventsGet( + run, + worker, + span, + selectedRange?.startTs, + selectedRange?.endTs + ) + .then((resp) => { + let curMaxSize: MaxEventSize = {} + let curFilterEventSize: EventSizeFilter = {} + for (let deviceName in resp.rows) { + curMaxSize[deviceName] = 0 + for (let i = 0; i < resp.rows[deviceName].length; i++) { + curMaxSize[deviceName] = Math.max( + curMaxSize[deviceName], + resp.rows[deviceName][i][1] + ) + } + curFilterEventSize[deviceName] = [ + curMaxSize[deviceName] / 4, + curMaxSize[deviceName] + ] + curMaxSize[deviceName] = curMaxSize[deviceName] + } + setMaxSize(curMaxSize) + setFilterEventSize(curFilterEventSize) + setMemoryEventsData(resp) + }) + }, [run, worker, span, selectedRange]) + + React.useEffect(() => { + api.defaultApi.memoryCurveGet(run, worker, span).then((resp) => { + setMemoryCurveData(resp) + // Reset the select range to null whenever run/worker/span changes + setSelectedRange(undefined) + }) + }, [run, worker, span]) + + React.useEffect(() => { + if (memoryCurveData !== undefined) { + setLineChartData({ + title: memoryCurveData.metadata.peaks[device], + columns: memoryCurveData.columns, + rows: memoryCurveData.rows[device] ?? [] + }) + } + }, [memoryCurveData, device]) + + const onDeviceChanged: SelectProps['onChange'] = (event) => { + setDevice(event.target.value as string) + setSelectedRange(undefined) + } + + const onSelectedRangeChanged = (start: number, end: number) => { + let bias = memoryCurveData?.metadata.first_ts ?? 0 + let scale = 1 / (memoryCurveData?.metadata.time_factor ?? 1) + let startTs = Math.round(start * scale + bias) + let endTs = Math.round(end * scale + bias) + if (startTs == endTs) { + setSelectedRange(undefined) + return + } + setSelectedRange({ start, end, startTs, endTs }) + } + + return ( +
+ + + + + + + {(graph) => ( + + + Device + + + {showCurve() && lineChartData && ( + +
+ +
+
+ )} +
+ )} +
+
+ {showEvents() && ( + <> + + + + + + + + + + + + + + + + + + + + + + + {(data) => { + return ( + + ) + }} + + + + )} + <> + + + + + + + + {(data) => ( + + )} + + + +
+
+
+
+ ) +}) diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/ModuleView.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/ModuleView.tsx new file mode 100644 index 0000000000000000000000000000000000000000..e28c29f3fd5e71c93070e675f839e5dd26c56568 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/ModuleView.tsx @@ -0,0 +1,263 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ +import Card from '@material-ui/core/Card' +import CardHeader from '@material-ui/core/CardHeader' +import InputLabel from '@material-ui/core/InputLabel' +import MenuItem from '@material-ui/core/MenuItem' +import Select, { SelectProps } from '@material-ui/core/Select' +import { makeStyles } from '@material-ui/core/styles' +import { Table } from 'antd' +import * as React from 'react' +import { FlameGraph } from 'react-flame-graph' +import { + defaultApi, + KeyedColumn, + ModuleStats, + ModuleViewData, + OperatorNode +} from '../api' + +const useStyles = makeStyles((theme) => ({ + root: { + flexGrow: 1 + }, + hide: { + display: 'none' + } +})) + +export interface IProps { + run: string + worker: string + span: string +} + +const getKeyedTableColumns = (columns: KeyedColumn[]) => { + return columns.map((col) => { + return { + dataIndex: col.key, + key: col.key, + title: col.name + } + }) +} + +const getTableRows = (key: number, rows: ModuleStats[]) => { + return rows.map((row) => { + const data: any = { + key: key++, + name: row.name, + occurences: row.occurences, + operators: row.operators, + host_duration: row.host_duration, + self_host_duration: row.self_host_duration, + device_duration: row.device_duration, + self_device_duration: row.self_device_duration + } + + if (row.children.length) { + data.children = getTableRows(key, row.children) + } + + return data + }) +} + +const getFlameGraphData = (rows: ModuleStats[]) => { + return rows.map((row) => { + const data: any = { + name: row.name, + value: row.avg_duration, + tooltip: `${row.name} (module id: ${row.id}): ${row.avg_duration} us` + } + + if (row.children.length) { + data.children = getFlameGraphData(row.children) + } + + return data + }) +} + +const getTreeHeight = (row: ModuleStats): number => { + if (row.children && row.children.length) { + return 1 + Math.max(...row.children.map((child) => getTreeHeight(child))) + } else { + return 1 + } +} + +const getOperatorTree = ( + level: number, + row: OperatorNode, + result: object[] +) => { + result.push({ + level: level, + name: row.name, + start: row.start_time, + end: row.end_time + }) + if (row.children.length) { + row.children.forEach((child) => getOperatorTree(level + 1, child, result)) + } +} + +export const ModuleView: React.FC = (props) => { + const { run, worker, span } = props + const classes = useStyles() + + const [moduleView, setModuleView] = React.useState< + ModuleViewData | undefined + >(undefined) + const [flameData, setFlameData] = React.useState([]) + const [flameHeight, setFlameHeight] = React.useState(0) + const [modules, setModules] = React.useState([]) + const [module, setModule] = React.useState(0) + + const [columns, setColumns] = React.useState([]) + const [rows, setRows] = React.useState([]) + + const cardRef = React.useRef(null) + const [cardWidth, setCardWidth] = React.useState( + undefined + ) + const timelineRef = React.useRef(null) + + 
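+  // Load module statistics once per (run, worker, span): build the flame
+  // graph and tree-table rows from moduleGet, then draw the operator
+  // timeline from treeGet with Google Charts.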
React.useEffect(() => { + defaultApi + .moduleGet(run, worker, span) + .then((resp) => { + setModuleView(resp) + if (resp) { + // set the flamegraph data + const flameData: any[] = getFlameGraphData(resp.data) + setFlameData(flameData) + const flameHeight = Math.max( + ...flameData.map((x) => getTreeHeight(x)) + ) + setFlameHeight(flameHeight * 25) + setModules(Array.from(Array(flameData.length).keys())) + setModule(0) + + // set the tree table data + setColumns(getKeyedTableColumns(resp.columns)) + setRows(getTableRows(1, resp.data)) + } + }) + .catch((e) => { + if (e.status == 404) { + setModules([]) + setFlameData([]) + setRows([]) + } + }) + + if (cardRef.current) { + setCardWidth(cardRef.current.offsetWidth - 10) + } + if (timelineRef.current) { + defaultApi.treeGet(run, worker, span).then((resp) => { + if (resp) { + const data = new google.visualization.DataTable() + data.addColumn({ type: 'string', id: 'Layer' }) + data.addColumn({ type: 'string', id: 'Name' }) + data.addColumn({ type: 'string', role: 'tooltip' }) + data.addColumn({ type: 'number', id: 'Start' }) + data.addColumn({ type: 'number', id: 'End' }) + + let timeline_data: any[] = [] + getOperatorTree(0, resp, timeline_data) + timeline_data.sort((a, b) => a.level - b.level) + const max_level = timeline_data[timeline_data.length - 1].level + timeline_data.forEach((d) => { + data.addRow([ + d.level.toString(), + d.name, + `${d.name} Duration: ${d.end - d.start} us`, + d.start / 1000.0, // the time unit is us returned from server, but the google charts only accept milliseconds here + d.end / 1000.0 + ]) + }) + + const chart = new google.visualization.Timeline(timelineRef.current) + + // console.info(timeline_data) + const options = { + height: (max_level + 1) * 50, + tooltip: { + isHtml: true + }, + timeline: { + showRowLabels: false + } + } + chart.draw(data, options) + } + }) + } + }, [run, worker, span]) + + const handleModuleChange: SelectProps['onChange'] = (event) => { + setModule(event.target.value as number) + } + + const moduleComponent = () => { + const moduleFragment = ( + + Module + + + ) + + if (!modules || modules.length <= 1) { + return
<div className={classes.hide}>{moduleFragment}</div>
+ } else { + return moduleFragment + } + } + + return ( +
+
+
+
+      {/* defaultExpandAllRows only takes effect when the Table first renders;
+          if rows is empty at that point it is ignored and all data stays
+          collapsed. See https://segmentfault.com/a/1190000007830998 for more
+          information. */}
+      {rows && rows.length > 0 && (
+        <Table
+          columns={columns}
+          dataSource={rows}
+          expandable={{ defaultExpandAllRows: true }}
+        />
+      )}
+
+      {moduleComponent()}
+
+      {flameData && flameData.length > 0 && (
+        <FlameGraph
+          data={flameData[module]}
+          height={flameHeight}
+          width={cardWidth}
+          onChange={(node: any) => {
+            console.log(`"${node.name}" focused`)
+          }}
+        />
+      )}
+
+ +
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/Operator.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/Operator.tsx new file mode 100644 index 0000000000000000000000000000000000000000..c1f16aaf4e9d085a400614d2a3b0976107c3ecca --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/Operator.tsx @@ -0,0 +1,308 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import Card from '@material-ui/core/Card' +import CardContent from '@material-ui/core/CardContent' +import CardHeader from '@material-ui/core/CardHeader' +import FormControlLabel from '@material-ui/core/FormControlLabel' +import Grid from '@material-ui/core/Grid' +import GridList from '@material-ui/core/GridList' +import GridListTile from '@material-ui/core/GridListTile' +import InputLabel from '@material-ui/core/InputLabel' +import MenuItem from '@material-ui/core/MenuItem' +import Radio from '@material-ui/core/Radio' +import RadioGroup, { RadioGroupProps } from '@material-ui/core/RadioGroup' +import Select, { SelectProps } from '@material-ui/core/Select' +import { makeStyles } from '@material-ui/core/styles' +import TextField, { + StandardTextFieldProps, + TextFieldProps +} from '@material-ui/core/TextField' +import * as React from 'react' +import * as api from '../api' +import { + OperationTableData, + OperationTableDataInner, + OperatorGraph +} from '../api' +import { OperationGroupBy } from '../constants/groupBy' +import { useSearchDirectly } from '../utils/search' +import { topIsValid, UseTop, useTopN } from '../utils/top' +import { PieChart } from './charts/PieChart' +import { DataLoading } from './DataLoading' +import { makeChartHeaderRenderer, useTooltipCommonStyles } from './helpers' +import { OperationTable } from './tables/OperationTable' +import { + DeviceSelfTimeTooltip, + DeviceTotalTimeTooltip, + HostSelfTimeTooltip, + HostTotalTimeTooltip +} from './TooltipDescriptions' + +const useStyles = makeStyles((theme) => ({ + root: { + flexGrow: 1 + }, + verticalInput: { + display: 'flex', + alignItems: 'center' + }, + inputWidth: { + width: '4em' + }, + inputWidthOverflow: { + minWidth: '15em', + whiteSpace: 'nowrap' + }, + full: { + width: '100%' + }, + description: { + marginLeft: theme.spacing(1) + } +})) + +export interface IProps { + run: string + worker: string + span: string +} + +export const Operator: React.FC = (props) => { + const { run, worker, span } = props + const classes = useStyles() + const tooltipCommonClasses = useTooltipCommonStyles() + const chartHeaderRenderer = React.useMemo( + () => makeChartHeaderRenderer(tooltipCommonClasses), + [tooltipCommonClasses] + ) + + const [operatorGraph, setOperatorGraph] = React.useState< + OperatorGraph | undefined + >(undefined) + const [operatorTable, setOperatorTable] = React.useState< + OperationTableData | undefined + >(undefined) + const [sortColumn, setSortColumn] = React.useState('') + const [tableTooltips, setTableTooltips] = React.useState( + undefined + ) + const [groupBy, setGroupBy] = React.useState(OperationGroupBy.Operation) + const [searchOperatorName, setSearchOperatorName] = React.useState('') + const [topText, actualTop, useTop, setTopText, setUseTop] = useTopN({ + defaultUseTop: UseTop.Use, + defaultTop: 10 + }) + + const getName = React.useCallback( + (row: OperationTableDataInner) => row.name, + [] + 
) + const [searchedOperatorTable] = useSearchDirectly( + searchOperatorName, + getName, + operatorTable + ) + + const onSearchOperatorChanged: TextFieldProps['onChange'] = (event) => { + setSearchOperatorName(event.target.value as string) + } + + React.useEffect(() => { + if (operatorGraph) { + const counts = [ + operatorGraph.device_self_time?.rows.length ?? 0, + operatorGraph.device_total_time?.rows.length ?? 0, + operatorGraph.host_self_time.rows?.length ?? 0, + operatorGraph.host_total_time.rows?.length ?? 0 + ] + setTopText(String(Math.min(Math.max(...counts), 10))) + } + }, [operatorGraph]) + + React.useEffect(() => { + api.defaultApi + .operationTableGet(run, worker, span, groupBy) + .then((resp) => { + setSortColumn(resp.metadata.sort) + setTableTooltips(resp.metadata.tooltips) + setOperatorTable(resp.data) + }) + }, [run, worker, span, groupBy]) + + React.useEffect(() => { + api.defaultApi + .operationGet(run, worker, span, OperationGroupBy.Operation) + .then((resp) => { + setOperatorGraph(resp) + }) + }, [run, worker, span]) + + const onGroupByChanged: SelectProps['onChange'] = (event) => { + setGroupBy(event.target.value as OperationGroupBy) + } + + const onUseTopChanged: RadioGroupProps['onChange'] = (event) => { + setUseTop(event.target.value as UseTop) + } + + const onTopChanged = (event: React.ChangeEvent) => { + setTopText(event.target.value) + } + + const inputProps: StandardTextFieldProps['inputProps'] = { + min: 1 + } + + const renderCharts = (graph: api.OperatorGraph) => { + return ( + + {graph.device_self_time && ( + + + {graph.device_self_time.title && ( + + )} + + + + )} + {graph.device_total_time && ( + + + {graph.device_total_time.title && ( + + )} + + + + )} + + + {graph.host_self_time.title && ( + + )} + + + + + + {graph.host_total_time.title && ( + + )} + + + + + ) + } + + return ( +
+ + + + + + + + } + label="All operators" + /> + } + label="Top operators to show" + /> + + + {useTop === UseTop.Use && ( + + + + )} + + + {renderCharts} + + + + + + Group By + + + + + + + + + + {(table) => ( + + )} + + + + + + +
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/Overview.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/Overview.tsx new file mode 100644 index 0000000000000000000000000000000000000000..c9d16bf9580abbc7e226fbd5110ac8bd71f354ed --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/Overview.tsx @@ -0,0 +1,220 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import Card from '@material-ui/core/Card' +import CardContent from '@material-ui/core/CardContent' +import CardHeader from '@material-ui/core/CardHeader' +import Grid from '@material-ui/core/Grid' +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' +import * as api from '../api' +import { PieChart } from './charts/PieChart' +import { SteppedAreaChart } from './charts/SteppedAreaChart' +import { TableChart } from './charts/TableChart' +import { DataLoading } from './DataLoading' +import { makeChartHeaderRenderer, useTooltipCommonStyles } from './helpers' +import { TextListItem } from './TextListItem' +import { StepTimeBreakDownTooltip } from './TooltipDescriptions' +import { + transformPerformanceIntoPie, + transformPerformanceIntoTable +} from './transform' + +const topGraphHeight = 230 + +const useStyles = makeStyles((theme) => ({ + root: { + flexGrow: 1 + }, + pre: { + '& ul': { + margin: 0, + paddingLeft: theme.spacing(3), + ...theme.typography.body1 + }, + '& li': {}, + '& a': { + color: '#ffa726' + }, + '& a:active': { + color: '#ffa726' + }, + '& p': { + margin: 0, + ...theme.typography.subtitle1, + fontWeight: theme.typography.fontWeightBold + } + }, + topGraph: { + height: topGraphHeight + 40 + } +})) + +const highlightNoTopLevel = ( + row: number, + column: number, + cb: (key: string, value: any) => void +) => { + if (row !== 0) { + cb('style', 'background: #e0e0e0') + } +} + +export interface IProps { + run: string + worker: string + span: string +} + +export const Overview: React.FC = (props) => { + const { run, worker, span } = props + + const [steps, setSteps] = React.useState(undefined) + const [performances, setPerformances] = React.useState([]) + const [environments, setEnvironments] = React.useState([]) + const [gpuMetrics, setGpuMetrics] = React.useState< + api.GpuMetrics | undefined + >(undefined) + const [recommendations, setRecommendations] = React.useState('') + + const synthesizedTableGraph = React.useMemo(() => { + return transformPerformanceIntoTable(performances) + }, [performances]) + + const synthesizedPieGraph = React.useMemo(() => { + return transformPerformanceIntoPie(performances) + }, [performances]) + + React.useEffect(() => { + api.defaultApi.overviewGet(run, worker, span).then((resp) => { + setPerformances(resp.performance) + setEnvironments(resp.environments) + setSteps(resp.steps) + setRecommendations(resp.recommendations) + setGpuMetrics(resp.gpu_metrics) + console.log(resp.gpu_metrics) + }) + }, [run, worker, span]) + + const classes = useStyles() + const tooltipCommonClasses = useTooltipCommonStyles() + const chartHeaderRenderer = React.useMemo( + () => makeChartHeaderRenderer(tooltipCommonClasses, false), + [tooltipCommonClasses] + ) + + const stepTimeBreakDownTitle = React.useMemo( + () => chartHeaderRenderer('Step Time Breakdown', StepTimeBreakDownTooltip), + [tooltipCommonClasses, chartHeaderRenderer] + ) + 
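+  // Grid column widths for the top row: [environment summary, GPU summary,
+  // step-time breakdown]; without GPU metrics the middle card is omitted.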
+ const cardSizes = gpuMetrics + ? ([2, 3, 7] as const) + : ([4, undefined, 8] as const) + + return ( +
+ + + + {React.useMemo( + () => ( + + + + {environments.map((environment) => ( + + ))} + + + ), + [environments] + )} + + {gpuMetrics && ( + + + + + {gpuMetrics.data.map((metric) => ( + + ))} + + + + )} + + + + + + + + + + + + + + + + + + + + + + + {(graph) => ( + + )} + + + + + + + + + + +
+
+
+ + + + + +
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/TextListItem.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/TextListItem.tsx new file mode 100644 index 0000000000000000000000000000000000000000..c5e4eee5251f7ab8afedf58f305a5cb30ad92a19 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/TextListItem.tsx @@ -0,0 +1,89 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import Grid from '@material-ui/core/Grid' +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' + +export interface IStylesProps { + root?: string + name?: string +} + +export interface IProps { + name: string + value?: string + description?: string + extra?: string + classes?: IStylesProps + dangerouslyAllowHtml?: boolean +} + +const useStyles = makeStyles((theme) => ({ + label: { + ...theme.typography.subtitle2, + fontWeight: 'bolder' + }, + value: { + textAlign: 'right', + ...theme.typography.subtitle2, + fontWeight: 'bolder' + } +})) + +export const TextListItem: React.FC = (props) => { + const classes = useStyles() + + const getSizes = function () { + if (props.value && props.extra) { + return [4, 4, 4] as const + } + if (props.value) { + if (props.value.length > props.name.length) { + return [4, 8, undefined] as const + } + return [8, 4, undefined] as const + } + return [12, undefined, undefined] as const + } + + const sizes = getSizes() + + const renderSpan = function (content: string, className?: string) { + if (props.dangerouslyAllowHtml) { + return ( + + ) + } + return {content} + } + + return ( + + + + + {renderSpan(props.name, props.classes?.name)} + + {props.description && ( + {renderSpan(props.description)} + )} + + + {props.value && ( + + {renderSpan(props.value)} + + )} + {props.extra && ( + + {renderSpan(props.extra)} + + )} + + ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts b/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts new file mode 100644 index 0000000000000000000000000000000000000000..4fd0d92d2705ca7174b55e41d005438ca361584d --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts @@ -0,0 +1,32 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +export const StepTimeBreakDownTooltip = `The time spent on each step is broken down into multiple categories as follows: +Kernel: Kernels execution time on GPU device; +Memcpy: GPU involved memory copy time (either D2D, D2H or H2D); +Memset: GPU involved memory set time; +Runtime: CUDA runtime execution time on host side; Such as cudaLaunchKernel, cudaMemcpyAsync, cudaStreamSynchronize, ... 
+DataLoader: The data loading time spent in the PyTorch DataLoader object;
+CPU Exec: Host compute time, including every PyTorch operator’s running time;
+Other: The time not included in any of the above.`
+
+export const DeviceSelfTimeTooltip = `The accumulated time spent on GPU, not including this operator’s child operators.`
+
+export const DeviceTotalTimeTooltip = `The accumulated time spent on GPU, including this operator’s child operators.`
+
+export const HostSelfTimeTooltip = `The accumulated time spent on Host, not including this operator’s child operators.`
+
+export const HostTotalTimeTooltip = `The accumulated time spent on Host, including this operator’s child operators.`
+
+export const GPUKernelTotalTimeTooltip = `The accumulated time of all calls of this kernel.`
+
+export const TensorCoresPieChartTooltip = `The accumulated time of all kernels using or not using Tensor Cores.`
+
+export const DistributedGpuInfoTableTooltip = `Information about GPU hardware used during the run.`
+
+export const DistributedOverlapGraphTooltip = `The time spent on computation vs communication.`
+
+export const DistributedWaittimeGraphTooltip = `The time spent waiting vs communicating between devices.`
+
+export const DistributedCommopsTableTooltip = `Statistics for operations managing communications between nodes.`
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/TraceView.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/TraceView.tsx
new file mode 100644
index 0000000000000000000000000000000000000000..8f1f3684305cabfe6f35d341557386c1d8f71cf1
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/src/components/TraceView.tsx
@@ -0,0 +1,86 @@
+/*---------------------------------------------------------------------------------------------
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ *---------------------------------------------------------------------------------------------*/
+
+import ClickAwayListener from '@material-ui/core/ClickAwayListener'
+import { makeStyles } from '@material-ui/core/styles'
+import * as React from 'react'
+import * as api from '../api'
+
+export interface IProps {
+  run: string
+  worker: string
+  span: string
+  iframeRef: React.RefObject<HTMLIFrameElement>
+}
+
+const useStyles = makeStyles(() => ({
+  root: {
+    flexGrow: 1
+  },
+  frame: {
+    width: '100%',
+    height: 'calc(100vh - 48px)',
+    border: 'none'
+  }
+}))
+
+export const TraceView: React.FC<IProps> = (props) => {
+  const { run, worker, span, iframeRef } = props
+  const classes = useStyles()
+
+  const [traceData, setTraceData] = React.useState<Promise<string> | null>(
+    null
+  )
+  const [traceViewReady, setTraceViewReady] = React.useState(false)
+
+  React.useEffect(() => {
+    setTraceData(
+      api.defaultApi.traceGet(run, worker, span).then((resp) => {
+        return JSON.stringify(resp)
+      })
+    )
+  }, [run, worker, span])
+
+  React.useEffect(() => {
+    function callback(event: MessageEvent) {
+      const data = event.data || {}
+      if (data.msg === 'ready') {
+        setTraceViewReady(true)
+      }
+    }
+
+    window.addEventListener('message', callback)
+    return () => {
+      window.removeEventListener('message', callback)
+    }
+  }, [])
+
+  React.useEffect(() => {
+    if (traceData && traceViewReady) {
+      traceData.then((data) => {
+        iframeRef.current?.contentWindow?.postMessage(
+          { msg: 'data', data },
+          '*'
+        )
+      })
+    }
+  }, [traceData, traceViewReady])
+  const SetIframeActive = () => {
+    iframeRef.current?.focus()
+  }
+  return (
+ {React.useMemo( + () => ( + + + + ), + [] + )} +
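+      {/* Handshake with the embedded trace viewer: the iframe page is
+          expected to announce readiness roughly like
+            window.parent.postMessage({ msg: 'ready' }, '*')
+          (assumed viewer-side code, not part of this diff); once that message
+          and the fetched trace JSON are both available, the effect above posts
+          { msg: 'data', data } into the iframe. */}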
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/charts/AntTableChart.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/AntTableChart.tsx new file mode 100644 index 0000000000000000000000000000000000000000..62da832071f774c27384f70778ea4f5675ea6beb --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/AntTableChart.tsx @@ -0,0 +1,110 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { makeStyles } from '@material-ui/core/styles' +import { Table } from 'antd' +import * as React from 'react' +import { Graph } from '../../api' + +interface IProps { + graph: Graph + sortColumn?: string + initialPageSize?: number + onRowSelected?: (record?: object, rowIndex?: number) => void +} + +const useStyles = makeStyles((theme) => ({ + tooltip: { + whiteSpace: 'pre-wrap' + }, + row: { + wordBreak: 'break-word' + } +})) + +const getTableColumns = function ( + columns: any, + sort: string | undefined, + tooltipClass: string +) { + let i = 0 + return columns.map(function (col: any) { + const key = 'col' + i++ + const stringCompare = (a: any, b: any) => a[key].localeCompare(b[key]) + const numberCompare = (a: any, b: any) => (a[key] || 0) - (b[key] || 0) + return { + dataIndex: key, + key: key, + title: col.name, + sorter: col.type == 'string' ? stringCompare : numberCompare, + defaultSortOrder: sort == col.name ? ('descend' as const) : undefined, + showSorterTooltip: col.tooltip + ? { title: col.tooltip, overlayClassName: tooltipClass } + : true + } + }) +} + +const getTableRows = function (rows: any) { + return rows.map(function (row: any) { + let i = 0 + const res: any = {} + row.forEach(function (entry: any) { + res['col' + i++] = entry + }) + return res + }) +} + +export const AntTableChart: React.FC = (props) => { + const { graph, sortColumn, initialPageSize, onRowSelected } = props + const classes = useStyles(props) + + const rows = React.useMemo(() => getTableRows(graph.rows), [graph.rows]) + + const columns = React.useMemo( + () => getTableColumns(graph.columns, sortColumn, classes.tooltip), + [graph.columns, sortColumn, classes.tooltip] + ) + + // key is used to reset the Table state (page and sort) if the columns change + const key = React.useMemo(() => Math.random() + '', [graph.columns]) + + const [pageSize, setPageSize] = React.useState(initialPageSize ?? 30) + const onShowSizeChange = (current: number, size: number) => { + setPageSize(size) + } + + const onRow = (record: object, rowIndex?: number) => { + return { + onMouseEnter: (event: any) => { + if (onRowSelected) { + onRowSelected(record, rowIndex) + } + }, + onMouseLeave: (event: any) => { + if (onRowSelected) { + onRowSelected(undefined, undefined) + } + } + } + } + + return ( +
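+    // antd <Table> rendered here: memoized `columns`/`rows` from above,
+    // pagination controlled by pageSize/onShowSizeChange, and onRow feeding
+    // hover enter/leave into the onRowSelected callback.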
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/charts/AreaChart.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/AreaChart.tsx new file mode 100644 index 0000000000000000000000000000000000000000..6a0f5b484d9c156927edfeae64a729bec821c164 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/AreaChart.tsx @@ -0,0 +1,70 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' +import { Graph } from '../../api' +import { useResizeEventDependency } from '../../utils/resize' + +interface IProps { + graph: Graph + height?: number + hAxisTitle?: string +} + +const useStyles = makeStyles(() => ({ + root: { + height: (props: Pick) => props.height + } +})) + +export const AreaChart: React.FC = (props) => { + const { graph, height = 400, hAxisTitle } = props + const classes = useStyles({ height }) + const graphRef = React.useRef(null) + const [resizeEventDependency] = useResizeEventDependency() + + React.useLayoutEffect(() => { + const element = graphRef.current + if (!element) return + + const data = new google.visualization.DataTable() + data.addColumn('string', 'step') + graph.columns.forEach((column) => { + data.addColumn({ + type: column.type, + label: column.name, + role: column.role, + p: column.p + }) + }) + data.addRows(graph.rows.map((x, i) => [(i + 1).toString(), ...x])) + + const options = { + title: graph.title, + isStacked: true, + height, + legend: { position: 'bottom' }, + tooltip: { isHtml: true }, + chartArea: { left: '15%', width: '80%', top: '10%' }, + hAxis: { + title: hAxisTitle + } + } + + const chart = new google.visualization.AreaChart(element) + + chart.draw(data, options) + + return () => { + chart.clearChart() + } + }, [graph, height, resizeEventDependency]) + + return ( +
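+    // The Google AreaChart draws into the div rendered below (graphRef); each
+    // row is labeled with its 1-based index as the "step" on the x-axis, and
+    // the effect's cleanup clears the chart on re-draw or unmount.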
+
+
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/charts/ColumnChart.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/ColumnChart.tsx new file mode 100644 index 0000000000000000000000000000000000000000..40d1d1b9b64d0403cb15096c84ae55bbdcfd8a69 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/ColumnChart.tsx @@ -0,0 +1,87 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' +import { Graph } from '../../api' +import { useResizeEventDependency } from '../../utils/resize' + +interface IProps { + title?: string + units?: string + colors?: Array + chartData: ColumnChartData +} + +const useStyles = makeStyles(() => ({ + root: { + height: 500 + } +})) + +export interface ColumnChartData { + legends: Array + barLabels: Array + barHeights: Array> +} + +export const ColumnChart: React.FC = (props) => { + const { title, units, colors, chartData } = props + const { legends, barLabels, barHeights } = chartData + const classes = useStyles() + const graphRef = React.useRef(null) + const [resizeEventDependency] = useResizeEventDependency() + + React.useLayoutEffect(() => { + const element = graphRef.current + if (!element) return + + const data = new google.visualization.DataTable() + data.addColumn({ + type: 'string', + label: 'Worker' + }) + legends.forEach((label) => { + data.addColumn({ + type: 'number', + label + }) + }) + const rows = barHeights.map((heights, i) => + [barLabels[i] as string | number].concat(heights) + ) + data.addRows(rows) + + const options = { + height: 500, + title, + isStacked: true, + legend: { position: 'bottom' }, + vAxis: { + title: units + }, + tooltip: { isHtml: true }, + chartArea: { + left: '15%', + width: '80%', + top: title ? '10%' : '5%' + }, + colors + } + + const chart = new google.visualization.ColumnChart(element) + + chart.draw(data, options) + + return () => { + chart.clearChart() + } + }, [title, chartData, resizeEventDependency]) + + return ( +
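+    // Stacked ColumnChart drawn into the ref'd div: one bar per barLabels
+    // entry and one series per legend, with heights taken from the matching
+    // barHeights row.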
+
+
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/charts/LineChart.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/LineChart.tsx new file mode 100644 index 0000000000000000000000000000000000000000..c8958452394254209ae7a9ea69cdb2c5428d7b66 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/LineChart.tsx @@ -0,0 +1,134 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' +import { Graph } from '../../api' +import { useResizeEventDependency } from '../../utils/resize' +import { binarySearch } from '../../utils/binarysearch' + +interface IProps { + graph: Graph + height?: number + hAxisTitle?: string + vAxisTitle?: string + explorerOptions?: object + onSelectionChanged?: (start: number, end: number) => void + record?: any +} + +const useStyles = makeStyles(() => ({ + root: { + height: (props: Pick) => props.height + } +})) + +export const LineChart: React.FC = (props) => { + const { + graph, + height = 400, + hAxisTitle, + vAxisTitle, + onSelectionChanged, + explorerOptions, + record + } = props + const classes = useStyles({ height }) + const graphRef = React.useRef(null) + const [resizeEventDependency] = useResizeEventDependency() + const [chartObj, setChartObj] = React.useState() + + React.useLayoutEffect(() => { + const element = graphRef.current + if (!element) return + + const data = new google.visualization.DataTable() + graph.columns.forEach((column) => { + data.addColumn({ + type: column.type, + label: column.name, + role: column.role, + p: column.p + }) + }) + data.addRows(graph.rows) + + const options = { + title: graph.title, + isStacked: true, + height, + legend: { position: 'bottom' }, + tooltip: { isHtml: true }, + hAxis: { + title: hAxisTitle + }, + vAxis: { + title: vAxisTitle + }, + explorer: explorerOptions + } + + const chart = new google.visualization.LineChart(element) + + // Disable selection of single point + google.visualization.events.addListener(chart, 'select', function () { + chart.setSelection() + }) + + google.visualization.events.addListener(chart, 'ready', function () { + var zoomLast = getCoords() + var observer = new MutationObserver(function () { + var zoomCurrent = getCoords() + if (JSON.stringify(zoomLast) !== JSON.stringify(zoomCurrent)) { + zoomLast = getCoords() + if (onSelectionChanged) { + onSelectionChanged(zoomLast.x_min, zoomLast.x_max) + } + } + }) + if (graphRef.current) { + observer.observe(graphRef.current, { + childList: true, + subtree: true + }) + } + }) + + function getCoords() { + var chartLayout = chart.getChartLayoutInterface() + var chartBounds = chartLayout.getChartAreaBoundingBox() + + return { + x_min: chartLayout.getHAxisValue(chartBounds.left), + x_max: chartLayout.getHAxisValue(chartBounds.width + chartBounds.left) + } + } + + chart.draw(data, options) + setChartObj(chart) + }, [graph, height, resizeEventDependency]) + + React.useEffect(() => { + const compare_fn = (key: number, mid: Array) => + key - parseFloat(mid[0].toFixed(2)) + if (chartObj) { + if (record) { + let startId = binarySearch(graph.rows, record.col2, compare_fn) + let endId = binarySearch(graph.rows, record.col3, compare_fn) + let selection = [] + if (startId >= 0) selection.push({ row: startId, column: 1 }) + if 
(endId >= 0) selection.push({ row: endId, column: 1 }) + chartObj.setSelection(selection) + } else { + chartObj.setSelection() + } + } + }, [graph, record, chartObj]) + + return ( +
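+    // Google Charts exposes no zoom event, so the 'ready' listener above
+    // attaches a MutationObserver: explorer zooming mutates the chart DOM,
+    // getCoords() reads back the visible hAxis window, and onSelectionChanged
+    // fires only when that window actually changed. The `record` effect
+    // highlights the rows found by binary-searching the x values (col2/col3).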
+
+
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/charts/PieChart.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/PieChart.tsx new file mode 100644 index 0000000000000000000000000000000000000000..adae3cb9e540db6a8dd60a61b7dc7b9397ee23c6 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/PieChart.tsx @@ -0,0 +1,106 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' +import { Graph } from '../../api' +import { value } from '../../utils' +import { useResizeEventDependency } from '../../utils/resize' + +interface IProps { + graph: Graph + height?: number + top?: number + noLegend?: boolean + title?: string + colors?: Array + tooltip_mode?: string +} + +const useStyles = makeStyles(() => ({ + root: { + height: (props: IProps) => props.height ?? 300 + } +})) + +const noLegendArea = { left: '5%', width: '90%', top: '5%', height: '90%' } +const normalArea = { left: '5%', width: '95%' } +const noTitleArea = { left: '5%', width: '95%', top: '10%', height: '80%' } + +export const PieChart: React.FC = (props) => { + const { + graph, + height = 300, + top, + noLegend, + title, + colors, + tooltip_mode = 'both' + } = props + const classes = useStyles(props) + const graphRef = React.useRef(null) + + const [resizeEventDependency] = useResizeEventDependency() + + React.useLayoutEffect(() => { + const element = graphRef.current + if (!element) return + + const data = new google.visualization.DataTable() + graph.columns.forEach((column) => { + data.addColumn({ + type: column.type, + label: column.name, + role: column.role, + p: column.p + }) + }) + + const rows = + top === undefined + ? graph.rows + : graph.rows + .sort((a, b) => (value(b[1]) as number) - (value(a[1]) as number)) + .slice(0, top) + data.addRows(rows) + + const options = { + height, + width: '100%', + title, + pieHole: 0.4, + tooltip: { trigger: 'selection', isHtml: true, text: tooltip_mode }, + chartArea: noLegend ? noLegendArea : !title ? noTitleArea : normalArea, + legend: noLegend ? 'none' : undefined, + sliceVisibilityThreshold: 0, + colors + } + + const chart = new google.visualization.PieChart(element) + + google.visualization.events.addListener( + chart, + 'onmouseover', + function (entry: any) { + chart.setSelection([{ row: entry.row }]) + } + ) + + google.visualization.events.addListener(chart, 'onmouseout', function () { + chart.setSelection([]) + }) + + chart.draw(data, options) + + return () => { + chart.clearChart() + } + }, [graph, height, top, resizeEventDependency]) + + return ( +
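+    // With `top` set, rows are sorted by value (descending) and truncated
+    // before drawing; the mouseover/mouseout listeners keep the hovered slice
+    // selected so the selection-triggered tooltip follows the pointer.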
+
+
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/charts/SteppedAreaChart.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/SteppedAreaChart.tsx new file mode 100644 index 0000000000000000000000000000000000000000..6d2647878bb7623f9739c3106d516514e9d87238 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/SteppedAreaChart.tsx @@ -0,0 +1,75 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' +import { Graph } from '../../api' +import { useResizeEventDependency } from '../../utils/resize' + +interface IProps { + graph: Graph + height?: number + hAxisTitle?: string + vAxisTitle?: string +} + +const useStyles = makeStyles(() => ({ + root: { + height: (props: Pick) => props.height + } +})) + +export const SteppedAreaChart: React.FC = (props) => { + const { graph, height = 400, hAxisTitle, vAxisTitle } = props + const classes = useStyles({ height }) + const graphRef = React.useRef(null) + const [resizeEventDependency] = useResizeEventDependency() + + React.useLayoutEffect(() => { + const element = graphRef.current + if (!element) return + + const data = new google.visualization.DataTable() + graph.columns.forEach((column) => { + data.addColumn({ + type: column.type, + label: column.name, + role: column.role, + p: column.p + }) + }) + data.addRows(graph.rows) + + const options = { + title: graph.title, + isStacked: true, + height, + legend: { position: 'bottom' }, + chartArea: { left: '15%', width: '80%', top: '10%' }, + connectSteps: false, + areaOpacity: 0.9, + tooltip: { isHtml: true }, + hAxis: { + title: hAxisTitle + }, + vAxis: { + title: vAxisTitle + } + } + + const chart = new google.visualization.SteppedAreaChart(element) + + chart.draw(data, options) + + return () => { + chart.clearChart() + } + }, [graph, height, resizeEventDependency]) + + return ( +
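+    // Stacked stepped areas with connectSteps: false render one discrete
+    // column per step; Overview uses this for the Step Time Breakdown chart.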
+
+
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/charts/TableChart.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/TableChart.tsx new file mode 100644 index 0000000000000000000000000000000000000000..267624c85e02e30e047ff50e7d126259b765c83e --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/charts/TableChart.tsx @@ -0,0 +1,87 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { makeStyles } from '@material-ui/core/styles' +import * as React from 'react' +import { Graph } from '../../api' +import { useResizeEventDependency } from '../../utils/resize' + +interface IProps { + graph: Graph + sortColumn?: number + height?: number + allowHtml?: boolean + setCellProperty?: ( + row: number, + column: number, + cb: (key: string, value: any) => void + ) => void +} + +const useStyles = makeStyles(() => ({ + root: { + height: (props: IProps) => props.height + } +})) + +export const TableChart: React.FC = (props) => { + const { graph, sortColumn, setCellProperty, allowHtml } = props + const classes = useStyles(props) + const graphRef = React.useRef(null) + const [resizeEventDependency] = useResizeEventDependency() + + React.useLayoutEffect(() => { + const element = graphRef.current + if (!element) return + + const data = new google.visualization.DataTable() + graph.columns.forEach((column) => { + data.addColumn({ + type: column.type, + label: column.name, + role: column.role, + p: column.p + }) + }) + data.addRows(graph.rows) + + if (setCellProperty) { + for (let row = 0; row < graph.rows.length; ++row) { + for (let column = 0; column < graph.columns.length; ++column) { + setCellProperty(row, column, (key: string, value: any) => { + data.setProperty(row, column, key, value) + }) + } + } + } + + const options = { + width: '100%', + height: '100%', + page: 'enable', + allowHtml, + pageSize: 30, + tooltip: { isHtml: true }, + sortColumn: sortColumn, + sortAscending: false + } + + const chart = new google.visualization.Table(element) + + /* `chart.draw()` removes the contents of `element` and rebuilds it. This can cause a jump in the scroll position + * if the height/width change to 0. Since we can't change the code of Google Charts, we temporarily lock the dims + * of the parent container. */ + if (element.offsetHeight > 0) { + element.parentElement!.style.height = element.offsetHeight + 'px' + } + chart.draw(data, options) + element.parentElement!.style.height = '' + }, [graph, resizeEventDependency]) + + return ( +
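+    // setCellProperty lets callers tag individual cells before drawing, e.g.
+    // Overview's highlightNoTopLevel greys every row except the first.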
+
+
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/helpers.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/helpers.tsx new file mode 100644 index 0000000000000000000000000000000000000000..b787a5e91976a7f8f5839978276b35cf2a900cab --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/helpers.tsx @@ -0,0 +1,49 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { makeStyles } from '@material-ui/core/styles' +import Tooltip from '@material-ui/core/Tooltip' +import HelpOutline from '@material-ui/icons/HelpOutline' +import clsx from 'clsx' +import * as React from 'react' + +export const useTooltipCommonStyles = makeStyles((theme) => ({ + tooltip: { + maxWidth: '600px', + whiteSpace: 'pre-wrap', + fontSize: '14px' + }, + cardTitle: { + display: 'flex', + alignItems: 'center' + }, + titleText: { + marginRight: theme.spacing(0.5) + }, + smallTitleText: { + fontSize: '.8rem', + fontWeight: 'bold' + } +})) + +export const makeChartHeaderRenderer = ( + classes: ReturnType, + smallTitleText = true +) => (title: string, tooltip: string) => { + return ( + + + {title} + + + + + + ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/tables/CallFrameList.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/CallFrameList.tsx new file mode 100644 index 0000000000000000000000000000000000000000..1e2a385bb634b3988142ada0d947adbb46c99715 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/CallFrameList.tsx @@ -0,0 +1,42 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import * as React from 'react' +import { CallStackFrame } from './transform' +import { List } from 'antd' +import { NavToCodeButton } from './NavToCodeButton' +import { makeStyles } from '@material-ui/core/styles' + +interface IProps { + callFrames: CallStackFrame[] +} + +const useStyles = makeStyles(() => ({ + item: { + paddingTop: '1px !important', + paddingBottom: '1px !important' + } +})) + +export const CallFrameList = (props: IProps) => { + const classes = useStyles() + + const renderItem = React.useCallback( + (item: CallStackFrame) => ( + + + + ), + [classes.item] + ) + + return ( + + ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/tables/CallStackTable.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/CallStackTable.tsx new file mode 100644 index 0000000000000000000000000000000000000000..9bf085bbfb23cbc2e8d6a683e528111ac2e00121 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/CallStackTable.tsx @@ -0,0 +1,95 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. 
+ *--------------------------------------------------------------------------------------------*/ + +import * as React from 'react' +import { makeStyles } from '@material-ui/core/styles' +import { CallStackTableData, OperationTableDataInner } from '../../api' +import { Table, TableProps } from 'antd' + +import * as api from '../../api' +import { transformTableData, TransformedCallStackDataInner } from './transform' +import { attachId, getCommonOperationColumns } from './common' +import { OperationGroupBy } from '../../constants/groupBy' +import { makeExpandIcon } from './ExpandIcon' +import { CallFrameList } from './CallFrameList' + +export interface IProps { + data: OperationTableDataInner + run: string + worker: string + span: string + groupBy: OperationGroupBy +} + +const useStyles = makeStyles((theme) => ({ + tooltip: { + whiteSpace: 'pre-wrap' + } +})) + +const expandIcon = makeExpandIcon( + 'View call frames', + (record) => !record.callStackFrames.length +) + +const rowExpandable = (record: TransformedCallStackDataInner) => + !!record.callStackFrames.length +const expandedRowRender = (record: TransformedCallStackDataInner) => ( + +) + +export const CallStackTable = (props: IProps) => { + const { data, run, worker, span, groupBy } = props + const { name, input_shape } = data + const classes = useStyles(props) + + const [stackData, setStackData] = React.useState< + CallStackTableData | undefined + >(undefined) + const [tooltips, setTooltips] = React.useState() + + React.useEffect(() => { + api.defaultApi + .operationStackGet(run, worker, span, groupBy, name, input_shape) + .then((resp) => { + setTooltips(resp.metadata.tooltips) + setStackData(resp.data) + }) + }, [name, input_shape, run, worker, span, groupBy]) + + const transformedData = React.useMemo( + () => stackData && transformTableData(attachId(stackData)), + [stackData] + ) + + const columns = React.useMemo( + () => + transformedData && + getCommonOperationColumns(transformedData, undefined, tooltips, classes), + [transformedData] + ) + + const expandIconColumnIndex = columns?.length + + const expandable: TableProps['expandable'] = React.useMemo( + () => ({ + expandIconColumnIndex, + expandIcon, + expandedRowRender, + rowExpandable + }), + [expandIconColumnIndex] + ) + + return ( +
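+    // antd <Table> rendered here with the `expandable` config above: rows
+    // that have call stack frames expand (via the "View call frames" button)
+    // into a CallFrameList of the parsed frames; the stack data is fetched
+    // once per name/input_shape via operationStackGet.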
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/tables/ExpandIcon.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/ExpandIcon.tsx new file mode 100644 index 0000000000000000000000000000000000000000..68ff482827679d9c51c1ca0178b256dc5ae39581 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/ExpandIcon.tsx @@ -0,0 +1,34 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import * as React from 'react' +import { Button, TableProps } from 'antd' +import { OperationTableDataInner, CallStackTableDataInner } from '../../api' +import { Arguments } from '../../utils/type' + +type Types = NonNullable['expandable']>['expandIcon'] +type BasePropType = Arguments>>[0] +type PropType = BasePropType & { text: string; disabled?: boolean } + +export function ExpandIcon< + T extends OperationTableDataInner | CallStackTableDataInner +>(props: PropType) { + const onClick = (e: React.MouseEvent) => { + props.onExpand(props.record, e) + } + + return ( + + ) +} + +export function makeExpandIcon< + T extends OperationTableDataInner | CallStackTableDataInner +>(text: string, disabled?: (v: T) => boolean) { + return (props: BasePropType) => ( + + ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/tables/MemoryStatsTable.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/MemoryStatsTable.tsx new file mode 100644 index 0000000000000000000000000000000000000000..0b33ab4167ba11e9bb610d7ebc0717def2addda2 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/MemoryStatsTable.tsx @@ -0,0 +1,85 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import * as React from 'react' +import { Table } from 'antd' +import { makeStyles } from '@material-ui/core' + +export interface IProps { + data: any + sort: string +} + +const useStyles = makeStyles((theme) => ({ + tooltip: { + whiteSpace: 'pre-wrap' + } +})) + +const getMemoryStatsTableColumns = function ( + columns: any, + sort: string, + tooltipClass: string +) { + let i = 0 + return columns.map(function (col: any) { + const key = 'col' + i++ + const stringCompare = (a: any, b: any) => a[key].localeCompare(b[key]) + const numberCompare = (a: any, b: any) => (a[key] || 0) - (b[key] || 0) + return { + dataIndex: key, + key: key, + title: col.name, + sorter: col.type == 'string' ? stringCompare : numberCompare, + defaultSortOrder: sort == col.name ? ('descend' as const) : undefined, + showSorterTooltip: col.tooltip + ? 
{ title: col.tooltip, overlayClassName: tooltipClass } + : true + } + }) +} + +const getMemoryStatsTableRows = function (rows: any) { + return rows.map(function (row: any) { + let i = 0 + const res: any = {} + row.forEach(function (entry: any) { + res['col' + i++] = entry + }) + return res + }) +} + +export const MemoryStatsTable = (props: IProps) => { + const { data, sort } = props + const classes = useStyles() + + const rows = React.useMemo(() => getMemoryStatsTableRows(data.rows), [ + data.rows + ]) + + const columns = React.useMemo( + () => getMemoryStatsTableColumns(data.columns, sort, classes.tooltip), + [data.columns, sort, classes.tooltip] + ) + + const [pageSize, setPageSize] = React.useState(30) + const onShowSizeChange = (current: number, size: number) => { + setPageSize(size) + } + + return ( +
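+    // Same antd Table pattern as AntTableChart: generated col0..colN keys,
+    // string/number sorters chosen by column type, and a user-adjustable
+    // page size.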
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/tables/NavToCodeButton.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/NavToCodeButton.tsx new file mode 100644 index 0000000000000000000000000000000000000000..fb40e7f38bf5ccbe89851b5fe2d0b684af71239a --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/NavToCodeButton.tsx @@ -0,0 +1,29 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import * as React from 'react' +import { CallStackFrame } from './transform' +import { Button } from 'antd' +import { navToCode } from '../../utils/vscode' + +interface IProps { + frame: CallStackFrame +} + +export const NavToCodeButton = (props: IProps) => { + const { raw, line, file } = props.frame + const couldNavToFile = line && file + + const onClick = () => { + if (line && file) { + navToCode(file, line - 1) + } + } + + return ( + + ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/tables/OperationTable.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/OperationTable.tsx new file mode 100644 index 0000000000000000000000000000000000000000..65693b200a8671ffbcbfe23494394c4d663bb045 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/OperationTable.tsx @@ -0,0 +1,93 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import * as React from 'react' +import { makeStyles } from '@material-ui/core/styles' +import { + OperationTableData, + OperationTableDataInner, + TableMetadata +} from '../../api' +import { OperationGroupBy } from '../../constants/groupBy' +import { attachId, getCommonOperationColumns } from './common' +import { Table, TablePaginationConfig, TableProps } from 'antd' +import { makeExpandIcon } from './ExpandIcon' +import { CallStackTable } from './CallStackTable' + +export interface IProps { + data: OperationTableData + run: string + worker: string + span: string + groupBy: OperationGroupBy + sortColumn: string + tooltips?: any +} + +const useStyles = makeStyles((theme) => ({ + tooltip: { + whiteSpace: 'pre-wrap' + } +})) + +const rowExpandable = (record: OperationTableDataInner) => record.has_call_stack +const expandIcon = makeExpandIcon( + 'View CallStack', + (record) => !record.has_call_stack +) +export const OperationTable = (props: IProps) => { + const { data, run, worker, span, groupBy, sortColumn, tooltips } = props + const classes = useStyles(props) + + const rows = React.useMemo(() => attachId(data), [data]) + + const columns = React.useMemo( + () => getCommonOperationColumns(rows, sortColumn, tooltips, classes), + [rows] + ) + + const [pageSize, setPageSize] = React.useState(30) + const onShowSizeChange = (current: number, size: number) => { + setPageSize(size) + } + + const expandIconColumnIndex = columns.length + const expandedRowRender = React.useCallback( + (record: OperationTableDataInner) => ( + + ), + [run, worker, span, groupBy] + ) + + const expandable: TableProps['expandable'] = React.useMemo( + () => ({ + expandIconColumnIndex, + expandIcon, + expandedRowRender, + rowExpandable + }), + [expandIconColumnIndex, expandedRowRender] + ) + + return ( +
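+    // antd <Table> rendered here: each operation row expands into a nested
+    // CallStackTable scoped to the same run/worker/span/groupBy, and the
+    // "View CallStack" expand icon is disabled when has_call_stack is false.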
+ ) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/tables/common.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/common.tsx new file mode 100644 index 0000000000000000000000000000000000000000..1b2e0e413f3298960ad066437b57bca2ca46c680 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/common.tsx @@ -0,0 +1,143 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { firstOrUndefined, isDef } from '../../utils/def' +import { CallStackTableDataInner, OperationTableDataInner } from '../../api' +import type { ColumnsType } from 'antd/es/table' +import { ClassNameMap } from '@material-ui/styles' + +export function getCommonOperationColumns< + T extends OperationTableDataInner | CallStackTableDataInner +>( + data: T[] | undefined, + defaultSort?: string, + tooltips?: any, + classes?: ClassNameMap<'tooltip'> +): ColumnsType { + const firstData = firstOrUndefined(data) + + const hasInputShape = !firstData || isDef(firstData.input_shape) + const hasDeviceSelfDuration = + !firstData || isDef(firstData.device_self_duration) + const hasDeviceTotalDuration = + !firstData || isDef(firstData.device_total_duration) + const hasTcEligible = !firstData || isDef(firstData.tc_eligible) + const hasTcSelfRatio = !firstData || isDef(firstData.tc_self_ratio) + const hasTcTotalRatio = !firstData || isDef(firstData.tc_total_ratio) + + const nameCompare = (a: T, b: T) => a.name.localeCompare(b.name) + const callsCompare = (a: T, b: T) => a.calls - b.calls + const deviceSelfDurationCompare = (a: T, b: T) => + (a.device_self_duration || 0) - (b.device_self_duration || 0) + const deviceTotalDurationCompare = (a: T, b: T) => + (a.device_total_duration || 0) - (b.device_total_duration || 0) + const hostSelfDurationCompare = (a: T, b: T) => + (a.host_self_duration || 0) - (b.host_self_duration || 0) + const hostTotalDurationCompare = (a: T, b: T) => + (a.host_total_duration || 0) - (b.host_total_duration || 0) + const tcEligibleCompare = (a: T, b: T) => + a.tc_eligible!.localeCompare(b.tc_eligible!) + const tcSelfRatioCompare = (a: T, b: T) => + (a.tc_self_ratio || 0) - (b.tc_self_ratio || 0) + const tcTotalRatioCompare = (a: T, b: T) => + (a.tc_total_ratio || 0) - (b.tc_total_ratio || 0) + + const columns: ColumnsType = [ + { + dataIndex: 'name', + key: 'name', + title: 'Name', + sorter: nameCompare + }, + hasInputShape + ? { + dataIndex: 'input_shape', + key: 'input_shape', + title: 'Input Shape' + } + : undefined, + { + dataIndex: 'calls', + sorter: callsCompare, + key: 'calls', + title: 'Calls' + }, + hasDeviceSelfDuration + ? { + dataIndex: 'device_self_duration', + key: 'device_self_duration', + title: 'Device Self Duration (us)', + sorter: deviceSelfDurationCompare, + // Use device_self_duration as default sort if defaultSort is unspecified + defaultSortOrder: defaultSort ? undefined : ('descend' as const) + } + : undefined, + hasDeviceTotalDuration + ? 
{ + dataIndex: 'device_total_duration', + key: 'device_total_duration', + title: 'Device Total Duration (us)', + sorter: deviceTotalDurationCompare + } + : undefined, + { + dataIndex: 'host_self_duration', + key: 'host_self_duration', + title: 'Host Self Duration (us)', + sorter: hostSelfDurationCompare + }, + { + dataIndex: 'host_total_duration', + key: 'host_total_duration', + title: 'Host Total Duration (us)', + sorter: hostTotalDurationCompare + }, + hasTcEligible + ? { + dataIndex: 'tc_eligible', + key: 'tc_eligible', + title: 'Tensor Cores Eligible', + sorter: tcEligibleCompare + } + : undefined, + hasTcSelfRatio + ? { + dataIndex: 'tc_self_ratio', + key: 'tc_self_ratio', + title: 'Tensor Cores Self(%)', + sorter: tcSelfRatioCompare + } + : undefined, + hasTcTotalRatio + ? { + dataIndex: 'tc_total_ratio', + key: 'tc_total_ratio', + title: 'Tensor Cores Total(%)', + sorter: tcTotalRatioCompare + } + : undefined + ].filter(isDef) + columns.forEach((column) => { + if (column.key == defaultSort) { + column.defaultSortOrder = 'descend' as const + } + if (tooltips[column.key as string]) { + column.showSorterTooltip = { + title: tooltips[column.key as string], + overlayClassName: classes?.tooltip + } + } + }) + return columns +} + +let uid = 1 +export function attachId< + T extends CallStackTableDataInner | OperationTableDataInner +>(data: T[]): T[] { + return data.map((d) => ({ + ...d, + key: uid++ + })) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/tables/transform.ts b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/transform.ts new file mode 100644 index 0000000000000000000000000000000000000000..bd051fd429d5cb26a44a59b60f776b207a861d64 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/tables/transform.ts @@ -0,0 +1,63 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import { CallStackTableData, CallStackTableDataInner } from '../../api' + +export interface CallStackFrame { + file?: string + line?: number + raw: string +} + +export interface TransformedCallStackDataInner extends CallStackTableDataInner { + callStackFrames: CallStackFrame[] +} + +const lineRegex = /\([0-9]+\)$/ + +function parseCallStackLine(raw: string): CallStackFrame { + raw = raw.trim() + const results = raw.split(':') + const location = results.slice(0, results.length - 1).join(':') + + const result = lineRegex.exec(location) + if (!result) { + return { raw } + } + + const lineWithParens = result[0].trim() + const file = raw.slice(0, result.index).trim() + const line = Number( + lineWithParens.substr(1, lineWithParens.length - 2).trim() + ) + + return { + raw, + file, + line + } +} + +function parseCallStack(callStack: string | undefined): CallStackFrame[] { + const lines = (callStack ?? 
'') + .trim() + .split(';') + .map((x) => x.trim()) + return lines.map(parseCallStackLine) +} + +function transformCallStackData( + data: CallStackTableDataInner +): TransformedCallStackDataInner { + return { + ...data, + callStackFrames: parseCallStack(data.call_stack) + } +} + +export function transformTableData( + data: CallStackTableData +): TransformedCallStackDataInner[] { + return data.map(transformCallStackData) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/transform.ts b/tb_plugins/profiling/tb_plugin/fe/src/components/transform.ts new file mode 100644 index 0000000000000000000000000000000000000000..08dcb25a20daf1868cc4ff2ea6245f444330b93f --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/transform.ts @@ -0,0 +1,82 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import * as api from '../api' +import { assertDef, isDef } from '../utils/def' + +export function transformPerformanceIntoTable( + performances: api.Performance[] +): api.Graph { + const columns: api.GraphColumn[] = [ + { type: 'string', name: 'Category' }, + { type: 'number', name: 'Time Duration (us)' }, + { type: 'number', name: 'Percentage (%)' } + ] + + const rows: api.Graph['rows'] = [] + const queue = [...performances] + + while (queue.length) { + const first = queue.shift() + assertDef(first) + + const row: api.Graph['rows'][number] = [] + const { name, value, extra, children } = first + assertDef(value) + assertDef(extra) + + row.push(name) + row.push(value) + row.push(extra) + + if (isDef(children) && children.length) { + queue.push(...children) + } + + rows.push(row) + } + + return { + columns, + rows + } +} + +export function transformPerformanceIntoPie(performances: api.Performance[]) { + const columns: api.GraphColumn[] = [ + { type: 'string', name: 'Name' }, + { type: 'number', name: 'Value' } + ] + + const rows: api.Graph['rows'] = [] + const queue: api.Performance[] = [] + performances.forEach((topLevel) => { + if (topLevel.children) { + queue.push(...topLevel.children) + } + }) + + while (queue.length) { + const first = queue.shift() + assertDef(first) + + const row: api.Graph['rows'][number] = [] + const { name, value, children } = first + assertDef(value) + + row.push(name) + row.push(Number.parseInt(value, 10)) + + if (isDef(children) && children.length) { + queue.push(...children) + } + + rows.push(row) + } + + return { + columns, + rows + } +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/constants/groupBy.ts b/tb_plugins/profiling/tb_plugin/fe/src/constants/groupBy.ts new file mode 100644 index 0000000000000000000000000000000000000000..2b96c6b8dd3a0f1127f2617b72934d65c89f01f0 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/constants/groupBy.ts @@ -0,0 +1,13 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. 
+ *--------------------------------------------------------------------------------------------*/ + +export enum OperationGroupBy { + Operation = 'Operation', + OperationAndInputShape = 'OperationAndInputShape' +} + +export enum KernelGroupBy { + Kernel = 'Kernel', + KernelNameAndOpName = 'KernelNameAndOpName' +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/gstatic.d.ts b/tb_plugins/profiling/tb_plugin/fe/src/gstatic.d.ts new file mode 100644 index 0000000000000000000000000000000000000000..646255c2cdc20595fc0166b8cd5ce4743549bd2c --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/gstatic.d.ts @@ -0,0 +1,6 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +declare const google: any +declare module 'react-flame-graph' diff --git a/tb_plugins/profiling/tb_plugin/fe/src/index.tsx b/tb_plugins/profiling/tb_plugin/fe/src/index.tsx new file mode 100644 index 0000000000000000000000000000000000000000..8e2104cc831de871ec325e7efebf59c70966a74d --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/index.tsx @@ -0,0 +1,9 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import * as React from 'react' +import { render } from 'react-dom' +import { App } from './app' + +render(, document.getElementById('app')) diff --git a/tb_plugins/profiling/tb_plugin/fe/src/setup.tsx b/tb_plugins/profiling/tb_plugin/fe/src/setup.tsx new file mode 100644 index 0000000000000000000000000000000000000000..5db44e8243119c7988ef33007e2eb3134fe6e857 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/setup.tsx @@ -0,0 +1,9 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. 
+ *--------------------------------------------------------------------------------------------*/ + +export async function setup() { + await google.charts.load('current', { + packages: ['corechart', 'table', 'timeline'] + }) +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/styles.css b/tb_plugins/profiling/tb_plugin/fe/src/styles.css new file mode 100644 index 0000000000000000000000000000000000000000..bea7de6770e6fb512f2fd2f3a5120a944045efa3 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/styles.css @@ -0,0 +1,13 @@ +.goog-tooltip { + display: none !important; +} + +.visualization-tooltip { + padding: 4px 10px; + white-space: nowrap; +} + +div.google-visualization-tooltip { + pointer-events: none; + max-width: 90%; +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/utils/binarysearch.ts b/tb_plugins/profiling/tb_plugin/fe/src/utils/binarysearch.ts new file mode 100644 index 0000000000000000000000000000000000000000..0477cac74d0b0d6836b53f18689891feb2f10cea --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/utils/binarysearch.ts @@ -0,0 +1,20 @@ +export function binarySearch( + arr: Array, + key: any, + compare_fn: Function +): number { + let low = 0, + high = arr.length - 1 + while (low <= high) { + let mid = Math.round((high + low) / 2) + let cmp = compare_fn(key, arr[mid]) + if (cmp > 0) { + low = mid + 1 + } else if (cmp < 0) { + high = mid - 1 + } else { + return mid + } + } + return -1 +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/utils/debounce.ts b/tb_plugins/profiling/tb_plugin/fe/src/utils/debounce.ts new file mode 100644 index 0000000000000000000000000000000000000000..fcd6368e6ac9e971c85267fe5e6ccc9781235c9e --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/utils/debounce.ts @@ -0,0 +1,21 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import * as React from 'react' + +export function useDebounce(value: T, delay: number): T { + const [debouncedValue, setDebouncedValue] = React.useState(value) + + React.useEffect(() => { + const handler = setTimeout(() => { + setDebouncedValue(value) + }, delay) + + return () => { + clearTimeout(handler) + } + }, [value, delay]) + + return debouncedValue +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/utils/def.ts b/tb_plugins/profiling/tb_plugin/fe/src/utils/def.ts new file mode 100644 index 0000000000000000000000000000000000000000..c024293a54e18e543c331226c317713f829c5c10 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/utils/def.ts @@ -0,0 +1,18 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. 
+ *---------------------------------------------------------------------------------------------*/
+
+export function isDef<T>(v: T | undefined | null): v is T {
+  return v !== null && v !== undefined
+}
+
+export function assertDef<T>(v: T | undefined | null): asserts v is T {
+  if (!isDef(v)) {
+    throw new Error('Must be defined')
+  }
+}
+
+export function firstOrUndefined<T>(v: T[] | undefined): T | undefined {
+  if (!v || !v.length) return undefined
+  return v[0]
+}
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/utils/hooks.ts b/tb_plugins/profiling/tb_plugin/fe/src/utils/hooks.ts
new file mode 100644
index 0000000000000000000000000000000000000000..d8dd3eff536eb5e22683debe4338e785fe630616
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/src/utils/hooks.ts
@@ -0,0 +1,31 @@
+/*---------------------------------------------------------------------------------------------
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ *---------------------------------------------------------------------------------------------*/
+
+import * as React from 'react'
+
+const cbs: (() => void)[] = []
+
+// One shared window listener that fans out to every registered callback.
+const handler = () => {
+  cbs.forEach((cb) => cb())
+}
+
+export const useOnResize = (cb: () => void) => {
+  React.useEffect(() => {
+    if (cbs.length === 0) {
+      window.addEventListener('resize', handler)
+    }
+    cbs.push(cb)
+
+    return () => {
+      const idx = cbs.indexOf(cb)
+      if (idx > -1) {
+        cbs.splice(idx, 1)
+      }
+      if (cbs.length === 0) {
+        window.removeEventListener('resize', handler)
+      }
+    }
+  }, [cb])
+}
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/utils/index.ts b/tb_plugins/profiling/tb_plugin/fe/src/utils/index.ts
new file mode 100644
index 0000000000000000000000000000000000000000..1c7074b4c2002c40dc0b3f2f3da88d9a2b783a5f
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/src/utils/index.ts
@@ -0,0 +1,24 @@
+/*---------------------------------------------------------------------------------------------
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ *---------------------------------------------------------------------------------------------*/
+
+import { ValueAndFormat } from '../api'
+
+export function firstOrUndefined<T>(v: T[] | undefined | null): T | undefined {
+  if (!v || !v.length) return undefined
+  return v[0]
+}
+
+export function sleep(delay: number) {
+  return new Promise((resolve) => setTimeout(resolve, delay))
+}
+
+export function isValueAndFormat(v: any): v is ValueAndFormat {
+  return 'f' in v && 'v' in v
+}
+
+export function value(
+  v: boolean | number | string | ValueAndFormat
+): boolean | number | string {
+  return typeof v === 'object' && isValueAndFormat(v) ? v.v : v
+}
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/utils/resize.ts b/tb_plugins/profiling/tb_plugin/fe/src/utils/resize.ts
new file mode 100644
index 0000000000000000000000000000000000000000..57ab394042651fcddb7a48cfa158647d2e6b9faa
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/src/utils/resize.ts
@@ -0,0 +1,27 @@
+/*---------------------------------------------------------------------------------------------
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ *--------------------------------------------------------------------------------------------*/ + +import * as React from 'react' +import debounce from '@material-ui/core/utils/debounce' + +export function useResizeEventDependency() { + const [version, setVersion] = React.useState(0) + + const increaseVersion = React.useCallback( + debounce(() => { + setVersion((prev) => prev + 1) + }, 100), + [] + ) + + React.useEffect(() => { + window.addEventListener('resize', increaseVersion) + + return () => { + window.removeEventListener('resize', increaseVersion) + } + }, []) + + return [version] as const +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/utils/search.ts b/tb_plugins/profiling/tb_plugin/fe/src/utils/search.ts new file mode 100644 index 0000000000000000000000000000000000000000..36689758752625b6c249c5fd532d93c9e5fbafb4 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/utils/search.ts @@ -0,0 +1,66 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + *--------------------------------------------------------------------------------------------*/ + +import * as React from 'react' +import { value } from '.' +import * as api from '../api' +import { useDebounce } from './debounce' + +export function useSearch( + searchName: string, + columnName: string, + table: api.Graph | undefined +): [api.Graph | undefined] { + const searchNameDebounce = useDebounce(searchName.trim(), 500) + + const searchedTable: api.Graph | undefined = React.useMemo(() => { + if (!searchNameDebounce) { + return table + } + + if (!table) { + return undefined + } + + const columnNameToFind = columnName.toLowerCase() + const nameColumnIdx = table.columns.findIndex( + (c) => c.name.toLowerCase() === columnNameToFind + ) + if (nameColumnIdx < 0) { + return table + } + + return { + ...table, + rows: table.rows.filter((x) => { + const cell = value(x[nameColumnIdx]) + return typeof cell === 'string' && cell.includes(searchNameDebounce) + }) + } + }, [table, searchNameDebounce]) + return [searchedTable] +} + +export function useSearchDirectly( + searchName: string, + field: (v: T) => string, + table: T[] | undefined +): [T[] | undefined] { + const searchNameDebounce = useDebounce(searchName.trim(), 500) + + const result = React.useMemo(() => { + if (!searchNameDebounce) { + return table + } + + if (!table) { + return undefined + } + + return table.filter((row) => { + return field(row).toLowerCase().includes(searchNameDebounce.toLowerCase()) + }) + }, [table, field, searchNameDebounce]) + return [result] +} diff --git a/tb_plugins/profiling/tb_plugin/fe/src/utils/top.ts b/tb_plugins/profiling/tb_plugin/fe/src/utils/top.ts new file mode 100644 index 0000000000000000000000000000000000000000..87bd3c1b86f763a63dbf195ee5feaf649d56e006 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/fe/src/utils/top.ts @@ -0,0 +1,50 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. 
+ *---------------------------------------------------------------------------------------------*/
+
+import debounce from '@material-ui/core/utils/debounce'
+import * as React from 'react'
+
+export enum UseTop {
+  NotUse = 'NotUse',
+  Use = 'Use'
+}
+
+interface IOptions {
+  defaultTop?: number
+  defaultUseTop?: UseTop
+  noDebounce?: boolean
+  wait?: number
+}
+
+export function useTopN(options?: IOptions) {
+  const { defaultTop, defaultUseTop, noDebounce, wait } = options ?? {}
+
+  const [topText, setTopText] = React.useState(String(defaultTop ?? 15))
+  const [actualTop, setActualTop] = React.useState<number | undefined>(
+    Number(topText)
+  )
+  const [useTop, setUseTop] = React.useState(
+    defaultUseTop ?? UseTop.NotUse
+  )
+
+  // Hooks must not be called conditionally, so the (possibly debounced)
+  // setter is chosen once inside an unconditional useMemo.
+  const setActualDebounce = React.useMemo(
+    () => (noDebounce ? setActualTop : debounce(setActualTop, wait ?? 500)),
+    []
+  )
+  React.useEffect(() => {
+    if (useTop !== UseTop.Use) {
+      setActualDebounce(undefined)
+    } else if (topIsValid(topText)) {
+      setActualDebounce(Number(topText))
+    } else {
+      setActualDebounce(actualTop)
+    }
+  }, [topText, useTop])
+
+  return [topText, actualTop, useTop, setTopText, setUseTop] as const
+}
+
+export function topIsValid(topText: string) {
+  const top = Number(topText)
+  return !Number.isNaN(top) && top > 0 && Number.isInteger(top)
+}
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/utils/type.ts b/tb_plugins/profiling/tb_plugin/fe/src/utils/type.ts
new file mode 100644
index 0000000000000000000000000000000000000000..fde74bc598b930f26dd8a83157c91953da2c045c
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/src/utils/type.ts
@@ -0,0 +1,9 @@
+/*---------------------------------------------------------------------------------------------
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ *---------------------------------------------------------------------------------------------*/
+
+export type Arguments<T extends (...args: any[]) => void> = T extends (
+  ...args: infer A
+) => void
+  ? A
+  : never
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/utils/vscode.ts b/tb_plugins/profiling/tb_plugin/fe/src/utils/vscode.ts
new file mode 100644
index 0000000000000000000000000000000000000000..62f1a90809548691f3b7b7a89d71ac65e4bf622b
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/src/utils/vscode.ts
@@ -0,0 +1,13 @@
+/*---------------------------------------------------------------------------------------------
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ *--------------------------------------------------------------------------------------------*/
+
+export function navToCode(filename: string, line: number) {
+  window.parent.parent.postMessage(
+    {
+      filename,
+      line
+    },
+    '*'
+  )
+}
diff --git a/tb_plugins/profiling/tb_plugin/fe/tsconfig.json b/tb_plugins/profiling/tb_plugin/fe/tsconfig.json
new file mode 100644
index 0000000000000000000000000000000000000000..182aafbe127c2c3209eacae7483ec02a2cd622cc
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/tsconfig.json
@@ -0,0 +1,18 @@
+{
+  "compilerOptions": {
+    "target": "es5",
+    "lib": [
+      "dom",
+      "dom.iterable",
+      "esnext"
+    ],
+    "strict": true,
+    "strictPropertyInitialization": false,
+    "module": "esnext",
+    "moduleResolution": "node",
+    "forceConsistentCasingInFileNames": true,
+    "rootDir": "src",
+    "outDir": "dist",
+    "jsx": "react"
+  }
+}
diff --git a/tb_plugins/profiling/tb_plugin/fe/update-static.js b/tb_plugins/profiling/tb_plugin/fe/update-static.js
new file mode 100644
index 0000000000000000000000000000000000000000..a4f59f7c7e20991d39e8db3615084fa13d43d98d
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/update-static.js
@@ -0,0 +1,9 @@
+const fs = require('fs')
+const path = require('path')
+
+fs.copyFileSync(
+  path.resolve(__dirname, 'dist/index.html'),
+  path.resolve(__dirname, '../torch_tb_profiler/static/index.html')
+)
+
+console.log('Copy done.')
diff --git a/tb_plugins/profiling/tb_plugin/fe/webpack.config.js b/tb_plugins/profiling/tb_plugin/fe/webpack.config.js
new file mode 100644
index 0000000000000000000000000000000000000000..38a635b9a1059269b8a8e2343a18aea1ac1e683a
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/webpack.config.js
@@ -0,0 +1,42 @@
+const path = require('path')
+const HtmlWebpackPlugin = require('html-webpack-plugin')
+const InlineChunkHtmlPlugin = require('inline-chunk-html-plugin')
+
+const isDev = process.env.NODE_ENV !== 'production'
+
+/**
+ * @type {import('webpack').Configuration & import('webpack-dev-server').Configuration}
+ */
+module.exports = {
+  mode: isDev ? 'development' : 'production',
+  entry: './src/index.tsx',
+  output: {
+    path: path.resolve(__dirname, 'dist'),
+    filename: 'index.js'
+  },
+  resolve: {
+    // Add `.ts` and `.tsx` as a resolvable extension.
+    extensions: ['.ts', '.tsx', '.js']
+  },
+  module: {
+    rules: [
+      { test: /\.tsx?$/i, use: 'ts-loader' },
+      { test: /\.css$/i, use: ['style-loader', 'css-loader'] }
+    ]
+  },
+  plugins: [
+    new HtmlWebpackPlugin({
+      inject: true,
+      scriptLoading: 'blocking',
+      template: 'index.html'
+    }),
+    // In production, inline every emitted chunk into index.html so that
+    // update-static.js can ship a single self-contained page.
+    !isDev ? new InlineChunkHtmlPlugin(HtmlWebpackPlugin, [/.*/]) : undefined
+  ].filter(Boolean),
+  devServer: {
+    // proxy: {
+    //   '/data/plugin/pytorch_profiler': ''
+    // }
+  }
+}
diff --git a/tb_plugins/profiling/tb_plugin/fe/yarn.lock b/tb_plugins/profiling/tb_plugin/fe/yarn.lock
new file mode 100644
index 0000000000000000000000000000000000000000..3e914db864c7654443e9041cfc1899ea2ac30bb1
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/fe/yarn.lock
@@ -0,0 +1,3672 @@
+# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
+# yarn lockfile v1 + + +"@ant-design/colors@^6.0.0": + version "6.0.0" + resolved "https://registry.yarnpkg.com/@ant-design/colors/-/colors-6.0.0.tgz#9b9366257cffcc47db42b9d0203bb592c13c0298" + integrity sha512-qAZRvPzfdWHtfameEGP2Qvuf838NhergR35o+EuVyB5XvSA98xod5r4utvi4TJ3ywmevm290g9nsCG5MryrdWQ== + dependencies: + "@ctrl/tinycolor" "^3.4.0" + +"@ant-design/icons-svg@^4.2.1": + version "4.2.1" + resolved "https://registry.yarnpkg.com/@ant-design/icons-svg/-/icons-svg-4.2.1.tgz#8630da8eb4471a4aabdaed7d1ff6a97dcb2cf05a" + integrity sha512-EB0iwlKDGpG93hW8f85CTJTs4SvMX7tt5ceupvhALp1IF44SeUFOMhKUOYqpsoYWQKAOuTRDMqn75rEaKDp0Xw== + +"@ant-design/icons@^4.7.0": + version "4.7.0" + resolved "https://registry.yarnpkg.com/@ant-design/icons/-/icons-4.7.0.tgz#8c3cbe0a556ba92af5dc7d1e70c0b25b5179af0f" + integrity sha512-aoB4Z7JA431rt6d4u+8xcNPPCrdufSRMUOpxa1ab6mz1JCQZOEVolj2WVs/tDFmN62zzK30mNelEsprLYsSF3g== + dependencies: + "@ant-design/colors" "^6.0.0" + "@ant-design/icons-svg" "^4.2.1" + "@babel/runtime" "^7.11.2" + classnames "^2.2.6" + rc-util "^5.9.4" + +"@ant-design/react-slick@~0.28.1": + version "0.28.4" + resolved "https://registry.yarnpkg.com/@ant-design/react-slick/-/react-slick-0.28.4.tgz#8b296b87ad7c7ae877f2a527b81b7eebd9dd29a9" + integrity sha512-j9eAHTn7GxbXUFNknJoHS2ceAsqrQi2j8XykjZE1IXCD8kJF+t28EvhBLniDpbOsBk/3kjalnhriTfZcjBHNqg== + dependencies: + "@babel/runtime" "^7.10.4" + classnames "^2.2.5" + json2mq "^0.2.0" + lodash "^4.17.21" + resize-observer-polyfill "^1.5.0" + +"@babel/runtime@^7.0.0", "@babel/runtime@^7.10.1", "@babel/runtime@^7.10.2", "@babel/runtime@^7.10.4", "@babel/runtime@^7.11.1", "@babel/runtime@^7.11.2", "@babel/runtime@^7.12.5", "@babel/runtime@^7.13.10", "@babel/runtime@^7.3.1", "@babel/runtime@^7.4.4", "@babel/runtime@^7.5.5", "@babel/runtime@^7.8.3", "@babel/runtime@^7.8.4", "@babel/runtime@^7.8.7": + version "7.17.2" + resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.17.2.tgz#66f68591605e59da47523c631416b18508779941" + integrity sha512-hzeyJyMA1YGdJTuWU0e/j4wKXrU4OMFvY2MSlaI9B7VQb0r5cxTE3EAIS2Q7Tn2RIcDkRvTA/v2JsAEhxe99uw== + dependencies: + regenerator-runtime "^0.13.4" + +"@ctrl/tinycolor@^3.4.0": + version "3.4.0" + resolved "https://registry.yarnpkg.com/@ctrl/tinycolor/-/tinycolor-3.4.0.tgz#c3c5ae543c897caa9c2a68630bed355be5f9990f" + integrity sha512-JZButFdZ1+/xAfpguQHoabIXkcqRRKpMrWKBkpEZZyxfY9C1DpADFB8PEqGSTeFr135SaTRfKqGKx5xSCLI7ZQ== + +"@discoveryjs/json-ext@^0.5.0": + version "0.5.6" + resolved "https://registry.yarnpkg.com/@discoveryjs/json-ext/-/json-ext-0.5.6.tgz#d5e0706cf8c6acd8c6032f8d54070af261bbbb2f" + integrity sha512-ws57AidsDvREKrZKYffXddNkyaF14iHNHm8VQnZH6t99E8gczjNN0GpvcGny0imC80yQ0tHz1xVUKk/KFQSUyA== + +"@emotion/hash@^0.8.0": + version "0.8.0" + resolved "https://registry.yarnpkg.com/@emotion/hash/-/hash-0.8.0.tgz#bbbff68978fefdbe68ccb533bc8cbe1d1afb5413" + integrity sha512-kBJtf7PH6aWwZ6fka3zQ0p6SBYzx4fl1LoZXE2RrnYST9Xljm7WfKJrU4g/Xr3Beg72MLrp1AWNUmuYJTL7Cow== + +"@material-ui/core@^4.11.3": + version "4.12.3" + resolved "https://registry.yarnpkg.com/@material-ui/core/-/core-4.12.3.tgz#80d665caf0f1f034e52355c5450c0e38b099d3ca" + integrity sha512-sdpgI/PL56QVsEJldwEe4FFaFTLUqN+rd7sSZiRCdx2E/C7z5yK0y/khAWVBH24tXwto7I1hCzNWfJGZIYJKnw== + dependencies: + "@babel/runtime" "^7.4.4" + "@material-ui/styles" "^4.11.4" + "@material-ui/system" "^4.12.1" + "@material-ui/types" "5.1.0" + "@material-ui/utils" "^4.11.2" + "@types/react-transition-group" "^4.2.0" + clsx "^1.0.4" + hoist-non-react-statics "^3.3.2" + popper.js 
"1.16.1-lts" + prop-types "^15.7.2" + react-is "^16.8.0 || ^17.0.0" + react-transition-group "^4.4.0" + +"@material-ui/icons@^4.11.2": + version "4.11.2" + resolved "https://registry.yarnpkg.com/@material-ui/icons/-/icons-4.11.2.tgz#b3a7353266519cd743b6461ae9fdfcb1b25eb4c5" + integrity sha512-fQNsKX2TxBmqIGJCSi3tGTO/gZ+eJgWmMJkgDiOfyNaunNaxcklJQFaFogYcFl0qFuaEz1qaXYXboa/bUXVSOQ== + dependencies: + "@babel/runtime" "^7.4.4" + +"@material-ui/styles@^4.11.4": + version "4.11.4" + resolved "https://registry.yarnpkg.com/@material-ui/styles/-/styles-4.11.4.tgz#eb9dfccfcc2d208243d986457dff025497afa00d" + integrity sha512-KNTIZcnj/zprG5LW0Sao7zw+yG3O35pviHzejMdcSGCdWbiO8qzRgOYL8JAxAsWBKOKYwVZxXtHWaB5T2Kvxew== + dependencies: + "@babel/runtime" "^7.4.4" + "@emotion/hash" "^0.8.0" + "@material-ui/types" "5.1.0" + "@material-ui/utils" "^4.11.2" + clsx "^1.0.4" + csstype "^2.5.2" + hoist-non-react-statics "^3.3.2" + jss "^10.5.1" + jss-plugin-camel-case "^10.5.1" + jss-plugin-default-unit "^10.5.1" + jss-plugin-global "^10.5.1" + jss-plugin-nested "^10.5.1" + jss-plugin-props-sort "^10.5.1" + jss-plugin-rule-value-function "^10.5.1" + jss-plugin-vendor-prefixer "^10.5.1" + prop-types "^15.7.2" + +"@material-ui/system@^4.12.1": + version "4.12.1" + resolved "https://registry.yarnpkg.com/@material-ui/system/-/system-4.12.1.tgz#2dd96c243f8c0a331b2bb6d46efd7771a399707c" + integrity sha512-lUdzs4q9kEXZGhbN7BptyiS1rLNHe6kG9o8Y307HCvF4sQxbCgpL2qi+gUk+yI8a2DNk48gISEQxoxpgph0xIw== + dependencies: + "@babel/runtime" "^7.4.4" + "@material-ui/utils" "^4.11.2" + csstype "^2.5.2" + prop-types "^15.7.2" + +"@material-ui/types@5.1.0": + version "5.1.0" + resolved "https://registry.yarnpkg.com/@material-ui/types/-/types-5.1.0.tgz#efa1c7a0b0eaa4c7c87ac0390445f0f88b0d88f2" + integrity sha512-7cqRjrY50b8QzRSYyhSpx4WRw2YuO0KKIGQEVk5J8uoz2BanawykgZGoWEqKm7pVIbzFDN0SpPcVV4IhOFkl8A== + +"@material-ui/utils@^4.11.2": + version "4.11.2" + resolved "https://registry.yarnpkg.com/@material-ui/utils/-/utils-4.11.2.tgz#f1aefa7e7dff2ebcb97d31de51aecab1bb57540a" + integrity sha512-Uul8w38u+PICe2Fg2pDKCaIG7kOyhowZ9vjiC1FsVwPABTW8vPPKfF6OvxRq3IiBaI1faOJmgdvMG7rMJARBhA== + dependencies: + "@babel/runtime" "^7.4.4" + prop-types "^15.7.2" + react-is "^16.8.0 || ^17.0.0" + +"@nodelib/fs.scandir@2.1.5": + version "2.1.5" + resolved "https://registry.yarnpkg.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz#7619c2eb21b25483f6d167548b4cfd5a7488c3d5" + integrity sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g== + dependencies: + "@nodelib/fs.stat" "2.0.5" + run-parallel "^1.1.9" + +"@nodelib/fs.stat@2.0.5", "@nodelib/fs.stat@^2.0.2": + version "2.0.5" + resolved "https://registry.yarnpkg.com/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz#5bd262af94e9d25bd1e71b05deed44876a222e8b" + integrity sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A== + +"@nodelib/fs.walk@^1.2.3": + version "1.2.8" + resolved "https://registry.yarnpkg.com/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz#e95737e8bb6746ddedf69c556953494f196fe69a" + integrity sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg== + dependencies: + "@nodelib/fs.scandir" "2.1.5" + fastq "^1.6.0" + +"@types/body-parser@*": + version "1.19.2" + resolved "https://registry.yarnpkg.com/@types/body-parser/-/body-parser-1.19.2.tgz#aea2059e28b7658639081347ac4fab3de166e6f0" + integrity sha512-ALYone6pm6QmwZoAgeyNksccT9Q4AWZQ6PvfwR37GT6r6FWUPguq6sUmNGSMV2Wr761oQoBxwGGa6DR5o1DC9g== + dependencies: 
+ "@types/connect" "*" + "@types/node" "*" + +"@types/bonjour@^3.5.9": + version "3.5.10" + resolved "https://registry.yarnpkg.com/@types/bonjour/-/bonjour-3.5.10.tgz#0f6aadfe00ea414edc86f5d106357cda9701e275" + integrity sha512-p7ienRMiS41Nu2/igbJxxLDWrSZ0WxM8UQgCeO9KhoVF7cOVFkrKsiDr1EsJIla8vV3oEEjGcz11jc5yimhzZw== + dependencies: + "@types/node" "*" + +"@types/connect-history-api-fallback@^1.3.5": + version "1.3.5" + resolved "https://registry.yarnpkg.com/@types/connect-history-api-fallback/-/connect-history-api-fallback-1.3.5.tgz#d1f7a8a09d0ed5a57aee5ae9c18ab9b803205dae" + integrity sha512-h8QJa8xSb1WD4fpKBDcATDNGXghFj6/3GRWG6dhmRcu0RX1Ubasur2Uvx5aeEwlf0MwblEC2bMzzMQntxnw/Cw== + dependencies: + "@types/express-serve-static-core" "*" + "@types/node" "*" + +"@types/connect@*": + version "3.4.35" + resolved "https://registry.yarnpkg.com/@types/connect/-/connect-3.4.35.tgz#5fcf6ae445e4021d1fc2219a4873cc73a3bb2ad1" + integrity sha512-cdeYyv4KWoEgpBISTxWvqYsVy444DOqehiF3fM3ne10AmJ62RSyNkUnxMJXHQWRQQX2eR94m5y1IZyDwBjV9FQ== + dependencies: + "@types/node" "*" + +"@types/eslint-scope@^3.7.3": + version "3.7.3" + resolved "https://registry.yarnpkg.com/@types/eslint-scope/-/eslint-scope-3.7.3.tgz#125b88504b61e3c8bc6f870882003253005c3224" + integrity sha512-PB3ldyrcnAicT35TWPs5IcwKD8S333HMaa2VVv4+wdvebJkjWuW/xESoB8IwRcog8HYVYamb1g/R31Qv5Bx03g== + dependencies: + "@types/eslint" "*" + "@types/estree" "*" + +"@types/eslint@*": + version "8.4.1" + resolved "https://registry.yarnpkg.com/@types/eslint/-/eslint-8.4.1.tgz#c48251553e8759db9e656de3efc846954ac32304" + integrity sha512-GE44+DNEyxxh2Kc6ro/VkIj+9ma0pO0bwv9+uHSyBrikYOHr8zYcdPvnBOp1aw8s+CjRvuSx7CyWqRrNFQ59mA== + dependencies: + "@types/estree" "*" + "@types/json-schema" "*" + +"@types/estree@*", "@types/estree@^0.0.51": + version "0.0.51" + resolved "https://registry.yarnpkg.com/@types/estree/-/estree-0.0.51.tgz#cfd70924a25a3fd32b218e5e420e6897e1ac4f40" + integrity sha512-CuPgU6f3eT/XgKKPqKd/gLZV1Xmvf1a2R5POBOGQa6uv82xpls89HU5zKeVoyR8XzHd1RGNOlQlvUe3CFkjWNQ== + +"@types/express-serve-static-core@*", "@types/express-serve-static-core@^4.17.18": + version "4.17.28" + resolved "https://registry.yarnpkg.com/@types/express-serve-static-core/-/express-serve-static-core-4.17.28.tgz#c47def9f34ec81dc6328d0b1b5303d1ec98d86b8" + integrity sha512-P1BJAEAW3E2DJUlkgq4tOL3RyMunoWXqbSCygWo5ZIWTjUgN1YnaXWW4VWl/oc8vs/XoYibEGBKP0uZyF4AHig== + dependencies: + "@types/node" "*" + "@types/qs" "*" + "@types/range-parser" "*" + +"@types/express@*", "@types/express@^4.17.13": + version "4.17.13" + resolved "https://registry.yarnpkg.com/@types/express/-/express-4.17.13.tgz#a76e2995728999bab51a33fabce1d705a3709034" + integrity sha512-6bSZTPaTIACxn48l50SR+axgrqm6qXFIxrdAKaG6PaJk3+zuUr35hBlgT7vOmJcum+OEaIBLtHV/qloEAFITeA== + dependencies: + "@types/body-parser" "*" + "@types/express-serve-static-core" "^4.17.18" + "@types/qs" "*" + "@types/serve-static" "*" + +"@types/html-minifier-terser@^6.0.0": + version "6.1.0" + resolved "https://registry.yarnpkg.com/@types/html-minifier-terser/-/html-minifier-terser-6.1.0.tgz#4fc33a00c1d0c16987b1a20cf92d20614c55ac35" + integrity sha512-oh/6byDPnL1zeNXFrDXFLyZjkr1MsBG667IM792caf1L2UPOOMf65NFzjUH/ltyfwjAGfs1rsX1eftK0jC/KIg== + +"@types/http-proxy@^1.17.8": + version "1.17.8" + resolved "https://registry.yarnpkg.com/@types/http-proxy/-/http-proxy-1.17.8.tgz#968c66903e7e42b483608030ee85800f22d03f55" + integrity sha512-5kPLG5BKpWYkw/LVOGWpiq3nEVqxiN32rTgI53Sk12/xHFQ2rG3ehI9IO+O3W2QoKeyB92dJkoka8SUm6BX1pA== + dependencies: + "@types/node" "*" 
+ +"@types/json-schema@*", "@types/json-schema@^7.0.8", "@types/json-schema@^7.0.9": + version "7.0.9" + resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.9.tgz#97edc9037ea0c38585320b28964dde3b39e4660d" + integrity sha512-qcUXuemtEu+E5wZSJHNxUXeCZhAfXKQ41D+duX+VYPde7xyEVZci+/oXKJL13tnRs9lR2pr4fod59GT6/X1/yQ== + +"@types/mime@^1": + version "1.3.2" + resolved "https://registry.yarnpkg.com/@types/mime/-/mime-1.3.2.tgz#93e25bf9ee75fe0fd80b594bc4feb0e862111b5a" + integrity sha512-YATxVxgRqNH6nHEIsvg6k2Boc1JHI9ZbH5iWFFv/MTkchz3b1ieGDa5T0a9RznNdI0KhVbdbWSN+KWWrQZRxTw== + +"@types/node@*": + version "17.0.21" + resolved "https://registry.yarnpkg.com/@types/node/-/node-17.0.21.tgz#864b987c0c68d07b4345845c3e63b75edd143644" + integrity sha512-DBZCJbhII3r90XbQxI8Y9IjjiiOGlZ0Hr32omXIZvwwZ7p4DMMXGrKXVyPfuoBOri9XNtL0UK69jYIBIsRX3QQ== + +"@types/prop-types@*": + version "15.7.4" + resolved "https://registry.yarnpkg.com/@types/prop-types/-/prop-types-15.7.4.tgz#fcf7205c25dff795ee79af1e30da2c9790808f11" + integrity sha512-rZ5drC/jWjrArrS8BR6SIr4cWpW09RNTYt9AMZo3Jwwif+iacXAqgVjm0B0Bv/S1jhDXKHqRVNCbACkJ89RAnQ== + +"@types/qs@*": + version "6.9.7" + resolved "https://registry.yarnpkg.com/@types/qs/-/qs-6.9.7.tgz#63bb7d067db107cc1e457c303bc25d511febf6cb" + integrity sha512-FGa1F62FT09qcrueBA6qYTrJPVDzah9a+493+o2PCXsesWHIn27G98TsSMs3WPNbZIEj4+VJf6saSFpvD+3Zsw== + +"@types/range-parser@*": + version "1.2.4" + resolved "https://registry.yarnpkg.com/@types/range-parser/-/range-parser-1.2.4.tgz#cd667bcfdd025213aafb7ca5915a932590acdcdc" + integrity sha512-EEhsLsD6UsDM1yFhAvy0Cjr6VwmpMWqFBCb9w07wVugF7w9nfajxLuVmngTIpgS6svCnm6Vaw+MZhoDCKnOfsw== + +"@types/react-dom@^16.9.8": + version "16.9.14" + resolved "https://registry.yarnpkg.com/@types/react-dom/-/react-dom-16.9.14.tgz#674b8f116645fe5266b40b525777fc6bb8eb3bcd" + integrity sha512-FIX2AVmPTGP30OUJ+0vadeIFJJ07Mh1m+U0rxfgyW34p3rTlXI+nlenvAxNn4BP36YyI9IJ/+UJ7Wu22N1pI7A== + dependencies: + "@types/react" "^16" + +"@types/react-transition-group@^4.2.0": + version "4.4.4" + resolved "https://registry.yarnpkg.com/@types/react-transition-group/-/react-transition-group-4.4.4.tgz#acd4cceaa2be6b757db61ed7b432e103242d163e" + integrity sha512-7gAPz7anVK5xzbeQW9wFBDg7G++aPLAFY0QaSMOou9rJZpbuI58WAuJrgu+qR92l61grlnCUe7AFX8KGahAgug== + dependencies: + "@types/react" "*" + +"@types/react@*": + version "17.0.39" + resolved "https://registry.yarnpkg.com/@types/react/-/react-17.0.39.tgz#d0f4cde092502a6db00a1cded6e6bf2abb7633ce" + integrity sha512-UVavlfAxDd/AgAacMa60Azl7ygyQNRwC/DsHZmKgNvPmRR5p70AJ5Q9EAmL2NWOJmeV+vVUI4IAP7GZrN8h8Ug== + dependencies: + "@types/prop-types" "*" + "@types/scheduler" "*" + csstype "^3.0.2" + +"@types/react@^16", "@types/react@^16.9.51": + version "16.14.23" + resolved "https://registry.yarnpkg.com/@types/react/-/react-16.14.23.tgz#37201b9f2324c5ff8fa4600dbf19079dfdffc880" + integrity sha512-WngBZLuSkP4IAgPi0HOsGCHo6dn3CcuLQnCfC17VbA7YBgipZiZoTOhObwl/93DsFW0Y2a/ZXeonpW4DxirEJg== + dependencies: + "@types/prop-types" "*" + "@types/scheduler" "*" + csstype "^3.0.2" + +"@types/retry@^0.12.0": + version "0.12.1" + resolved "https://registry.yarnpkg.com/@types/retry/-/retry-0.12.1.tgz#d8f1c0d0dc23afad6dc16a9e993a0865774b4065" + integrity sha512-xoDlM2S4ortawSWORYqsdU+2rxdh4LRW9ytc3zmT37RIKQh6IHyKwwtKhKis9ah8ol07DCkZxPt8BBvPjC6v4g== + +"@types/scheduler@*": + version "0.16.2" + resolved "https://registry.yarnpkg.com/@types/scheduler/-/scheduler-0.16.2.tgz#1a62f89525723dde24ba1b01b092bf5df8ad4d39" + integrity 
sha512-hppQEBDmlwhFAXKJX2KnWLYu5yMfi91yazPb2l+lbJiwW+wdo1gNeRA+3RgNSO39WYX2euey41KEwnqesU2Jew== + +"@types/serve-index@^1.9.1": + version "1.9.1" + resolved "https://registry.yarnpkg.com/@types/serve-index/-/serve-index-1.9.1.tgz#1b5e85370a192c01ec6cec4735cf2917337a6278" + integrity sha512-d/Hs3nWDxNL2xAczmOVZNj92YZCS6RGxfBPjKzuu/XirCgXdpKEb88dYNbrYGint6IVWLNP+yonwVAuRC0T2Dg== + dependencies: + "@types/express" "*" + +"@types/serve-static@*": + version "1.13.10" + resolved "https://registry.yarnpkg.com/@types/serve-static/-/serve-static-1.13.10.tgz#f5e0ce8797d2d7cc5ebeda48a52c96c4fa47a8d9" + integrity sha512-nCkHGI4w7ZgAdNkrEu0bv+4xNV/XDqW+DydknebMOQwkpDGx8G+HTlj7R7ABI8i8nKxVw0wtKPi1D+lPOkh4YQ== + dependencies: + "@types/mime" "^1" + "@types/node" "*" + +"@types/sockjs@^0.3.33": + version "0.3.33" + resolved "https://registry.yarnpkg.com/@types/sockjs/-/sockjs-0.3.33.tgz#570d3a0b99ac995360e3136fd6045113b1bd236f" + integrity sha512-f0KEEe05NvUnat+boPTZ0dgaLZ4SfSouXUgv5noUiefG2ajgKjmETo9ZJyuqsl7dfl2aHlLJUiki6B4ZYldiiw== + dependencies: + "@types/node" "*" + +"@types/ws@^8.2.2": + version "8.5.2" + resolved "https://registry.yarnpkg.com/@types/ws/-/ws-8.5.2.tgz#77e0c2e360e9579da930ffcfa53c5975ea3bdd26" + integrity sha512-VXI82ykONr5tacHEojnErTQk+KQSoYbW1NB6iz6wUwrNd+BqfkfggQNoNdCqhJSzbNumShPERbM+Pc5zpfhlbw== + dependencies: + "@types/node" "*" + +"@webassemblyjs/ast@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/ast/-/ast-1.11.1.tgz#2bfd767eae1a6996f432ff7e8d7fc75679c0b6a7" + integrity sha512-ukBh14qFLjxTQNTXocdyksN5QdM28S1CxHt2rdskFyL+xFV7VremuBLVbmCePj+URalXBENx/9Lm7lnhihtCSw== + dependencies: + "@webassemblyjs/helper-numbers" "1.11.1" + "@webassemblyjs/helper-wasm-bytecode" "1.11.1" + +"@webassemblyjs/floating-point-hex-parser@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.11.1.tgz#f6c61a705f0fd7a6aecaa4e8198f23d9dc179e4f" + integrity sha512-iGRfyc5Bq+NnNuX8b5hwBrRjzf0ocrJPI6GWFodBFzmFnyvrQ83SHKhmilCU/8Jv67i4GJZBMhEzltxzcNagtQ== + +"@webassemblyjs/helper-api-error@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/helper-api-error/-/helper-api-error-1.11.1.tgz#1a63192d8788e5c012800ba6a7a46c705288fd16" + integrity sha512-RlhS8CBCXfRUR/cwo2ho9bkheSXG0+NwooXcc3PAILALf2QLdFyj7KGsKRbVc95hZnhnERon4kW/D3SZpp6Tcg== + +"@webassemblyjs/helper-buffer@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/helper-buffer/-/helper-buffer-1.11.1.tgz#832a900eb444884cde9a7cad467f81500f5e5ab5" + integrity sha512-gwikF65aDNeeXa8JxXa2BAk+REjSyhrNC9ZwdT0f8jc4dQQeDQ7G4m0f2QCLPJiMTTO6wfDmRmj/pW0PsUvIcA== + +"@webassemblyjs/helper-numbers@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/helper-numbers/-/helper-numbers-1.11.1.tgz#64d81da219fbbba1e3bd1bfc74f6e8c4e10a62ae" + integrity sha512-vDkbxiB8zfnPdNK9Rajcey5C0w+QJugEglN0of+kmO8l7lDb77AnlKYQF7aarZuCrv+l0UvqL+68gSDr3k9LPQ== + dependencies: + "@webassemblyjs/floating-point-hex-parser" "1.11.1" + "@webassemblyjs/helper-api-error" "1.11.1" + "@xtuc/long" "4.2.2" + +"@webassemblyjs/helper-wasm-bytecode@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.11.1.tgz#f328241e41e7b199d0b20c18e88429c4433295e1" + integrity sha512-PvpoOGiJwXeTrSf/qfudJhwlvDQxFgelbMqtq52WWiXC6Xgg1IREdngmPN3bs4RoO83PnL/nFrxucXj1+BX62Q== + +"@webassemblyjs/helper-wasm-section@1.11.1": + version 
"1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.11.1.tgz#21ee065a7b635f319e738f0dd73bfbda281c097a" + integrity sha512-10P9No29rYX1j7F3EVPX3JvGPQPae+AomuSTPiF9eBQeChHI6iqjMIwR9JmOJXwpnn/oVGDk7I5IlskuMwU/pg== + dependencies: + "@webassemblyjs/ast" "1.11.1" + "@webassemblyjs/helper-buffer" "1.11.1" + "@webassemblyjs/helper-wasm-bytecode" "1.11.1" + "@webassemblyjs/wasm-gen" "1.11.1" + +"@webassemblyjs/ieee754@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/ieee754/-/ieee754-1.11.1.tgz#963929e9bbd05709e7e12243a099180812992614" + integrity sha512-hJ87QIPtAMKbFq6CGTkZYJivEwZDbQUgYd3qKSadTNOhVY7p+gfP6Sr0lLRVTaG1JjFj+r3YchoqRYxNH3M0GQ== + dependencies: + "@xtuc/ieee754" "^1.2.0" + +"@webassemblyjs/leb128@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/leb128/-/leb128-1.11.1.tgz#ce814b45574e93d76bae1fb2644ab9cdd9527aa5" + integrity sha512-BJ2P0hNZ0u+Th1YZXJpzW6miwqQUGcIHT1G/sf72gLVD9DZ5AdYTqPNbHZh6K1M5VmKvFXwGSWZADz+qBWxeRw== + dependencies: + "@xtuc/long" "4.2.2" + +"@webassemblyjs/utf8@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/utf8/-/utf8-1.11.1.tgz#d1f8b764369e7c6e6bae350e854dec9a59f0a3ff" + integrity sha512-9kqcxAEdMhiwQkHpkNiorZzqpGrodQQ2IGrHHxCy+Ozng0ofyMA0lTqiLkVs1uzTRejX+/O0EOT7KxqVPuXosQ== + +"@webassemblyjs/wasm-edit@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/wasm-edit/-/wasm-edit-1.11.1.tgz#ad206ebf4bf95a058ce9880a8c092c5dec8193d6" + integrity sha512-g+RsupUC1aTHfR8CDgnsVRVZFJqdkFHpsHMfJuWQzWU3tvnLC07UqHICfP+4XyL2tnr1amvl1Sdp06TnYCmVkA== + dependencies: + "@webassemblyjs/ast" "1.11.1" + "@webassemblyjs/helper-buffer" "1.11.1" + "@webassemblyjs/helper-wasm-bytecode" "1.11.1" + "@webassemblyjs/helper-wasm-section" "1.11.1" + "@webassemblyjs/wasm-gen" "1.11.1" + "@webassemblyjs/wasm-opt" "1.11.1" + "@webassemblyjs/wasm-parser" "1.11.1" + "@webassemblyjs/wast-printer" "1.11.1" + +"@webassemblyjs/wasm-gen@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/wasm-gen/-/wasm-gen-1.11.1.tgz#86c5ea304849759b7d88c47a32f4f039ae3c8f76" + integrity sha512-F7QqKXwwNlMmsulj6+O7r4mmtAlCWfO/0HdgOxSklZfQcDu0TpLiD1mRt/zF25Bk59FIjEuGAIyn5ei4yMfLhA== + dependencies: + "@webassemblyjs/ast" "1.11.1" + "@webassemblyjs/helper-wasm-bytecode" "1.11.1" + "@webassemblyjs/ieee754" "1.11.1" + "@webassemblyjs/leb128" "1.11.1" + "@webassemblyjs/utf8" "1.11.1" + +"@webassemblyjs/wasm-opt@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/wasm-opt/-/wasm-opt-1.11.1.tgz#657b4c2202f4cf3b345f8a4c6461c8c2418985f2" + integrity sha512-VqnkNqnZlU5EB64pp1l7hdm3hmQw7Vgqa0KF/KCNO9sIpI6Fk6brDEiX+iCOYrvMuBWDws0NkTOxYEb85XQHHw== + dependencies: + "@webassemblyjs/ast" "1.11.1" + "@webassemblyjs/helper-buffer" "1.11.1" + "@webassemblyjs/wasm-gen" "1.11.1" + "@webassemblyjs/wasm-parser" "1.11.1" + +"@webassemblyjs/wasm-parser@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/wasm-parser/-/wasm-parser-1.11.1.tgz#86ca734534f417e9bd3c67c7a1c75d8be41fb199" + integrity sha512-rrBujw+dJu32gYB7/Lup6UhdkPx9S9SnobZzRVL7VcBH9Bt9bCBLEuX/YXOOtBsOZ4NQrRykKhffRWHvigQvOA== + dependencies: + "@webassemblyjs/ast" "1.11.1" + "@webassemblyjs/helper-api-error" "1.11.1" + "@webassemblyjs/helper-wasm-bytecode" "1.11.1" + "@webassemblyjs/ieee754" "1.11.1" + "@webassemblyjs/leb128" "1.11.1" + "@webassemblyjs/utf8" "1.11.1" + 
+"@webassemblyjs/wast-printer@1.11.1": + version "1.11.1" + resolved "https://registry.yarnpkg.com/@webassemblyjs/wast-printer/-/wast-printer-1.11.1.tgz#d0c73beda8eec5426f10ae8ef55cee5e7084c2f0" + integrity sha512-IQboUWM4eKzWW+N/jij2sRatKMh99QEelo3Eb2q0qXkvPRISAj8Qxtmw5itwqK+TTkBuUIE45AxYPToqPtL5gg== + dependencies: + "@webassemblyjs/ast" "1.11.1" + "@xtuc/long" "4.2.2" + +"@webpack-cli/configtest@^1.1.1": + version "1.1.1" + resolved "https://registry.yarnpkg.com/@webpack-cli/configtest/-/configtest-1.1.1.tgz#9f53b1b7946a6efc2a749095a4f450e2932e8356" + integrity sha512-1FBc1f9G4P/AxMqIgfZgeOTuRnwZMten8E7zap5zgpPInnCrP8D4Q81+4CWIch8i/Nf7nXjP0v6CjjbHOrXhKg== + +"@webpack-cli/info@^1.4.1": + version "1.4.1" + resolved "https://registry.yarnpkg.com/@webpack-cli/info/-/info-1.4.1.tgz#2360ea1710cbbb97ff156a3f0f24556e0fc1ebea" + integrity sha512-PKVGmazEq3oAo46Q63tpMr4HipI3OPfP7LiNOEJg963RMgT0rqheag28NCML0o3GIzA3DmxP1ZIAv9oTX1CUIA== + dependencies: + envinfo "^7.7.3" + +"@webpack-cli/serve@^1.6.1": + version "1.6.1" + resolved "https://registry.yarnpkg.com/@webpack-cli/serve/-/serve-1.6.1.tgz#0de2875ac31b46b6c5bb1ae0a7d7f0ba5678dffe" + integrity sha512-gNGTiTrjEVQ0OcVnzsRSqTxaBSr+dmTfm+qJsCDluky8uhdLWep7Gcr62QsAKHTMxjCS/8nEITsmFAhfIx+QSw== + +"@xtuc/ieee754@^1.2.0": + version "1.2.0" + resolved "https://registry.yarnpkg.com/@xtuc/ieee754/-/ieee754-1.2.0.tgz#eef014a3145ae477a1cbc00cd1e552336dceb790" + integrity sha512-DX8nKgqcGwsc0eJSqYt5lwP4DH5FlHnmuWWBRy7X0NcaGR0ZtuyeESgMwTYVEtxmsNGY+qit4QYT/MIYTOTPeA== + +"@xtuc/long@4.2.2": + version "4.2.2" + resolved "https://registry.yarnpkg.com/@xtuc/long/-/long-4.2.2.tgz#d291c6a4e97989b5c61d9acf396ae4fe133a718d" + integrity sha512-NuHqBY1PB/D8xU6s/thBgOAiAP7HOYDQ32+BFZILJ8ivkUkAHQnWfn6WhL79Owj1qmUnoN/YPhktdIoucipkAQ== + +accepts@~1.3.4, accepts@~1.3.5, accepts@~1.3.8: + version "1.3.8" + resolved "https://registry.yarnpkg.com/accepts/-/accepts-1.3.8.tgz#0bf0be125b67014adcb0b0921e62db7bffe16b2e" + integrity sha512-PYAthTa2m2VKxuvSD3DPC/Gy+U+sOA1LAuT8mkmRuvw+NACSaeXEQ+NHcVF7rONl6qcaxV3Uuemwawk+7+SJLw== + dependencies: + mime-types "~2.1.34" + negotiator "0.6.3" + +acorn-import-assertions@^1.7.6: + version "1.8.0" + resolved "https://registry.yarnpkg.com/acorn-import-assertions/-/acorn-import-assertions-1.8.0.tgz#ba2b5939ce62c238db6d93d81c9b111b29b855e9" + integrity sha512-m7VZ3jwz4eK6A4Vtt8Ew1/mNbP24u0FhdyfA7fSvnJR6LMdfOYnmuIrrJAgrYfYJ10F/otaHTtrtrtmHdMNzEw== + +acorn@^8.4.1, acorn@^8.5.0: + version "8.7.0" + resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.7.0.tgz#90951fde0f8f09df93549481e5fc141445b791cf" + integrity sha512-V/LGr1APy+PXIwKebEWrkZPwoeoF+w1jiOBUmuxuiUIaOHtob8Qc9BTrYo7VuI5fR8tqsy+buA2WFooR5olqvQ== + +aggregate-error@^3.0.0: + version "3.1.0" + resolved "https://registry.yarnpkg.com/aggregate-error/-/aggregate-error-3.1.0.tgz#92670ff50f5359bdb7a3e0d40d0ec30c5737687a" + integrity sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA== + dependencies: + clean-stack "^2.0.0" + indent-string "^4.0.0" + +ajv-formats@^2.1.1: + version "2.1.1" + resolved "https://registry.yarnpkg.com/ajv-formats/-/ajv-formats-2.1.1.tgz#6e669400659eb74973bbf2e33327180a0996b520" + integrity sha512-Wx0Kx52hxE7C18hkMEggYlEifqWZtYaRgouJor+WMdPnQyEK13vgEWyVNup7SoeeoLMsr4kf5h6dOW11I15MUA== + dependencies: + ajv "^8.0.0" + +ajv-keywords@^3.5.2: + version "3.5.2" + resolved "https://registry.yarnpkg.com/ajv-keywords/-/ajv-keywords-3.5.2.tgz#31f29da5ab6e00d1c2d329acf7b5929614d5014d" + integrity 
sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ== + +ajv-keywords@^5.0.0: + version "5.1.0" + resolved "https://registry.yarnpkg.com/ajv-keywords/-/ajv-keywords-5.1.0.tgz#69d4d385a4733cdbeab44964a1170a88f87f0e16" + integrity sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw== + dependencies: + fast-deep-equal "^3.1.3" + +ajv@^6.12.5: + version "6.12.6" + resolved "https://registry.yarnpkg.com/ajv/-/ajv-6.12.6.tgz#baf5a62e802b07d977034586f8c3baf5adf26df4" + integrity sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g== + dependencies: + fast-deep-equal "^3.1.1" + fast-json-stable-stringify "^2.0.0" + json-schema-traverse "^0.4.1" + uri-js "^4.2.2" + +ajv@^8.0.0, ajv@^8.8.0: + version "8.10.0" + resolved "https://registry.yarnpkg.com/ajv/-/ajv-8.10.0.tgz#e573f719bd3af069017e3b66538ab968d040e54d" + integrity sha512-bzqAEZOjkrUMl2afH8dknrq5KEk2SrwdBROR+vH1EKVQTqaUbJVPdc/gEdggTMM0Se+s+Ja4ju4TlNcStKl2Hw== + dependencies: + fast-deep-equal "^3.1.1" + json-schema-traverse "^1.0.0" + require-from-string "^2.0.2" + uri-js "^4.2.2" + +ansi-html-community@^0.0.8: + version "0.0.8" + resolved "https://registry.yarnpkg.com/ansi-html-community/-/ansi-html-community-0.0.8.tgz#69fbc4d6ccbe383f9736934ae34c3f8290f1bf41" + integrity sha512-1APHAyr3+PCamwNw3bXCPp4HFLONZt/yIH0sZp0/469KWNTEy+qN5jQ3GVX6DMZ1UXAi34yVwtTeaG/HpBuuzw== + +ansi-regex@^5.0.1: + version "5.0.1" + resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-5.0.1.tgz#082cb2c89c9fe8659a311a53bd6a4dc5301db304" + integrity sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ== + +ansi-regex@^6.0.1: + version "6.0.1" + resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-6.0.1.tgz#3183e38fae9a65d7cb5e53945cd5897d0260a06a" + integrity sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA== + +ansi-styles@^4.1.0: + version "4.3.0" + resolved "https://registry.yarnpkg.com/ansi-styles/-/ansi-styles-4.3.0.tgz#edd803628ae71c04c85ae7a0906edad34b648937" + integrity sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg== + dependencies: + color-convert "^2.0.1" + +antd@^4.17.0: + version "4.19.0" + resolved "https://registry.yarnpkg.com/antd/-/antd-4.19.0.tgz#1c637a4d7dde091a2299260ca89f05c29fb21f80" + integrity sha512-4Kp47+zg3j1g1lWmzFstGrmlGdHzUIvxAVXxYJKJqX+iQs++QYgcK2HF+9PBpwEwP6H6VPZCsL0LqKEflke5qg== + dependencies: + "@ant-design/colors" "^6.0.0" + "@ant-design/icons" "^4.7.0" + "@ant-design/react-slick" "~0.28.1" + "@babel/runtime" "^7.12.5" + "@ctrl/tinycolor" "^3.4.0" + classnames "^2.2.6" + copy-to-clipboard "^3.2.0" + lodash "^4.17.21" + memoize-one "^6.0.0" + moment "^2.25.3" + rc-cascader "~3.2.1" + rc-checkbox "~2.3.0" + rc-collapse "~3.1.0" + rc-dialog "~8.6.0" + rc-drawer "~4.4.2" + rc-dropdown "~3.3.2" + rc-field-form "~1.23.0" + rc-image "~5.2.5" + rc-input "^0.0.1-alpha.5" + rc-input-number "~7.3.0" + rc-mentions "~1.6.1" + rc-menu "~9.2.1" + rc-motion "^2.4.4" + rc-notification "~4.5.7" + rc-pagination "~3.1.9" + rc-picker "~2.6.4" + rc-progress "~3.2.1" + rc-rate "~2.9.0" + rc-resize-observer "^1.2.0" + rc-select "~14.0.0-alpha.15" + rc-slider "~10.0.0-alpha.4" + rc-steps "~4.1.0" + rc-switch "~3.2.0" + rc-table "~7.23.0" + rc-tabs "~11.10.0" + rc-textarea "~0.3.0" + rc-tooltip "~5.1.1" + rc-tree "~5.4.3" + rc-tree-select "~5.1.1" + rc-trigger "^5.2.10" + rc-upload 
"~4.3.0" + rc-util "^5.14.0" + scroll-into-view-if-needed "^2.2.25" + +anymatch@~3.1.2: + version "3.1.2" + resolved "https://registry.yarnpkg.com/anymatch/-/anymatch-3.1.2.tgz#c0557c096af32f106198f4f4e2a383537e378716" + integrity sha512-P43ePfOAIupkguHUycrc4qJ9kz8ZiuOUijaETwX7THt0Y/GNK7v0aa8rY816xWjZ7rJdA5XdMcpVFTKMq+RvWg== + dependencies: + normalize-path "^3.0.0" + picomatch "^2.0.4" + +array-flatten@1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/array-flatten/-/array-flatten-1.1.1.tgz#9a5f699051b1e7073328f2a008968b64ea2955d2" + integrity sha1-ml9pkFGx5wczKPKgCJaLZOopVdI= + +array-flatten@^2.1.0: + version "2.1.2" + resolved "https://registry.yarnpkg.com/array-flatten/-/array-flatten-2.1.2.tgz#24ef80a28c1a893617e2149b0c6d0d788293b099" + integrity sha512-hNfzcOV8W4NdualtqBFPyVO+54DSJuZGY9qT4pRroB6S9e3iiido2ISIC5h9R2sPJ8H3FHCIiEnsv1lPXO3KtQ== + +array-tree-filter@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/array-tree-filter/-/array-tree-filter-2.1.0.tgz#873ac00fec83749f255ac8dd083814b4f6329190" + integrity sha512-4ROwICNlNw/Hqa9v+rk5h22KjmzB1JGTMVKP2AKJBOCgb0yL0ASf0+YvCcLNNwquOHNX48jkeZIJ3a+oOQqKcw== + +array-union@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/array-union/-/array-union-2.1.0.tgz#b798420adbeb1de828d84acd8a2e23d3efe85e8d" + integrity sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw== + +async-validator@^4.0.2: + version "4.0.7" + resolved "https://registry.yarnpkg.com/async-validator/-/async-validator-4.0.7.tgz#034a0fd2103a6b2ebf010da75183bec299247afe" + integrity sha512-Pj2IR7u8hmUEDOwB++su6baaRi+QvsgajuFB9j95foM1N2gy5HM4z60hfusIO0fBPG5uLAEl6yCJr1jNSVugEQ== + +async@^2.6.2: + version "2.6.3" + resolved "https://registry.yarnpkg.com/async/-/async-2.6.3.tgz#d72625e2344a3656e3a3ad4fa749fa83299d82ff" + integrity sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg== + dependencies: + lodash "^4.17.14" + +balanced-match@^1.0.0: + version "1.0.2" + resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.2.tgz#e83e3a7e3f300b34cb9d87f615fa0cbf357690ee" + integrity sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw== + +batch@0.6.1: + version "0.6.1" + resolved "https://registry.yarnpkg.com/batch/-/batch-0.6.1.tgz#dc34314f4e679318093fc760272525f94bf25c16" + integrity sha1-3DQxT05nkxgJP8dgJyUl+UvyXBY= + +big.js@^5.2.2: + version "5.2.2" + resolved "https://registry.yarnpkg.com/big.js/-/big.js-5.2.2.tgz#65f0af382f578bcdc742bd9c281e9cb2d7768328" + integrity sha512-vyL2OymJxmarO8gxMr0mhChsO9QGwhynfuu4+MHTAW6czfq9humCB7rKpUjDd9YUiDPU4mzpyupFSvOClAwbmQ== + +binary-extensions@^2.0.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/binary-extensions/-/binary-extensions-2.2.0.tgz#75f502eeaf9ffde42fc98829645be4ea76bd9e2d" + integrity sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA== + +body-parser@1.19.2: + version "1.19.2" + resolved "https://registry.yarnpkg.com/body-parser/-/body-parser-1.19.2.tgz#4714ccd9c157d44797b8b5607d72c0b89952f26e" + integrity sha512-SAAwOxgoCKMGs9uUAUFHygfLAyaniaoun6I8mFY9pRAJL9+Kec34aU+oIjDhTycub1jozEfEwx1W1IuOYxVSFw== + dependencies: + bytes "3.1.2" + content-type "~1.0.4" + debug "2.6.9" + depd "~1.1.2" + http-errors "1.8.1" + iconv-lite "0.4.24" + on-finished "~2.3.0" + qs "6.9.7" + raw-body "2.4.3" + type-is "~1.6.18" + +bonjour@^3.5.0: + version "3.5.0" + resolved 
"https://registry.yarnpkg.com/bonjour/-/bonjour-3.5.0.tgz#8e890a183d8ee9a2393b3844c691a42bcf7bc9f5" + integrity sha1-jokKGD2O6aI5OzhExpGkK897yfU= + dependencies: + array-flatten "^2.1.0" + deep-equal "^1.0.1" + dns-equal "^1.0.0" + dns-txt "^2.0.2" + multicast-dns "^6.0.1" + multicast-dns-service-types "^1.1.0" + +boolbase@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e" + integrity sha1-aN/1++YMUes3cl6p4+0xDcwed24= + +brace-expansion@^1.1.7: + version "1.1.11" + resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd" + integrity sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA== + dependencies: + balanced-match "^1.0.0" + concat-map "0.0.1" + +braces@^3.0.1, braces@~3.0.2: + version "3.0.2" + resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107" + integrity sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A== + dependencies: + fill-range "^7.0.1" + +browserslist@^4.14.5, browserslist@^4.16.5: + version "4.20.0" + resolved "https://registry.yarnpkg.com/browserslist/-/browserslist-4.20.0.tgz#35951e3541078c125d36df76056e94738a52ebe9" + integrity sha512-bnpOoa+DownbciXj0jVGENf8VYQnE2LNWomhYuCsMmmx9Jd9lwq0WXODuwpSsp8AVdKM2/HorrzxAfbKvWTByQ== + dependencies: + caniuse-lite "^1.0.30001313" + electron-to-chromium "^1.4.76" + escalade "^3.1.1" + node-releases "^2.0.2" + picocolors "^1.0.0" + +buffer-from@^1.0.0: + version "1.1.2" + resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.2.tgz#2b146a6fd72e80b4f55d255f35ed59a3a9a41bd5" + integrity sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ== + +buffer-indexof@^1.0.0: + version "1.1.1" + resolved "https://registry.yarnpkg.com/buffer-indexof/-/buffer-indexof-1.1.1.tgz#52fabcc6a606d1a00302802648ef68f639da268c" + integrity sha512-4/rOEg86jivtPTeOUUT61jJO1Ya1TrR/OkqCSZDyq84WJh3LuuiphBYJN+fm5xufIk4XAFcEwte/8WzC8If/1g== + +bytes@3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/bytes/-/bytes-3.0.0.tgz#d32815404d689699f85a4ea4fa8755dd13a96048" + integrity sha1-0ygVQE1olpn4Wk6k+odV3ROpYEg= + +bytes@3.1.2: + version "3.1.2" + resolved "https://registry.yarnpkg.com/bytes/-/bytes-3.1.2.tgz#8b0beeb98605adf1b128fa4386403c009e0221a5" + integrity sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg== + +call-bind@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/call-bind/-/call-bind-1.0.2.tgz#b1d4e89e688119c3c9a903ad30abb2f6a919be3c" + integrity sha512-7O+FbCihrB5WGbFYesctwmTKae6rOiIzmz1icreWJ+0aA7LJfuqhEso2T9ncpcFtzMQtzXf2QGGueWJGTYsqrA== + dependencies: + function-bind "^1.1.1" + get-intrinsic "^1.0.2" + +camel-case@^4.1.2: + version "4.1.2" + resolved "https://registry.yarnpkg.com/camel-case/-/camel-case-4.1.2.tgz#9728072a954f805228225a6deea6b38461e1bd5a" + integrity sha512-gxGWBrTT1JuMx6R+o5PTXMmUnhnVzLQ9SNutD4YqKtI6ap897t3tKECYla6gCWEkplXnlNybEkZg9GEGxKFCgw== + dependencies: + pascal-case "^3.1.2" + tslib "^2.0.3" + +caniuse-lite@^1.0.30001313: + version "1.0.30001313" + resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001313.tgz#a380b079db91621e1b7120895874e2fd62ed2e2f" + integrity sha512-rI1UN0koZUiKINjysQDuRi2VeSCce3bYJNmDcj3PIKREiAmjakugBul1QSkg/fPrlULYl6oWfGg3PbgOSY9X4Q== + +chalk@^4.1.0: + version 
"4.1.2" + resolved "https://registry.yarnpkg.com/chalk/-/chalk-4.1.2.tgz#aac4e2b7734a740867aeb16bf02aad556a1e7a01" + integrity sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA== + dependencies: + ansi-styles "^4.1.0" + supports-color "^7.1.0" + +chokidar@^3.5.3: + version "3.5.3" + resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-3.5.3.tgz#1cf37c8707b932bd1af1ae22c0432e2acd1903bd" + integrity sha512-Dr3sfKRP6oTcjf2JmUmFJfeVMvXBdegxB0iVQ5eb2V10uFJUCAS8OByZdVAyVb8xXNz3GjjTgj9kLWsZTqE6kw== + dependencies: + anymatch "~3.1.2" + braces "~3.0.2" + glob-parent "~5.1.2" + is-binary-path "~2.1.0" + is-glob "~4.0.1" + normalize-path "~3.0.0" + readdirp "~3.6.0" + optionalDependencies: + fsevents "~2.3.2" + +chrome-trace-event@^1.0.2: + version "1.0.3" + resolved "https://registry.yarnpkg.com/chrome-trace-event/-/chrome-trace-event-1.0.3.tgz#1015eced4741e15d06664a957dbbf50d041e26ac" + integrity sha512-p3KULyQg4S7NIHixdwbGX+nFHkoBiA4YQmyWtjb8XngSKV124nJmRysgAeujbUVb15vh+RvFUfCPqU7rXk+hZg== + +classnames@2.x, classnames@^2.2.1, classnames@^2.2.3, classnames@^2.2.5, classnames@^2.2.6, classnames@^2.3.1: + version "2.3.1" + resolved "https://registry.yarnpkg.com/classnames/-/classnames-2.3.1.tgz#dfcfa3891e306ec1dad105d0e88f4417b8535e8e" + integrity sha512-OlQdbZ7gLfGarSqxesMesDa5uz7KFbID8Kpq/SxIoNGDqY8lSYs0D+hhtBXhcdB3rcbXArFr7vlHheLk1voeNA== + +clean-css@^5.2.2: + version "5.2.4" + resolved "https://registry.yarnpkg.com/clean-css/-/clean-css-5.2.4.tgz#982b058f8581adb2ae062520808fb2429bd487a4" + integrity sha512-nKseG8wCzEuji/4yrgM/5cthL9oTDc5UOQyFMvW/Q53oP6gLH690o1NbuTh6Y18nujr7BxlsFuS7gXLnLzKJGg== + dependencies: + source-map "~0.6.0" + +clean-stack@^2.0.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/clean-stack/-/clean-stack-2.2.0.tgz#ee8472dbb129e727b31e8a10a427dee9dfe4008b" + integrity sha512-4diC9HaTE+KRAMWhDhrGOECgWZxoevMc5TlkObMqNSsVU62PYzXZ/SMTjzyGAFF1YusgxGcSWTEXBhp0CPwQ1A== + +clone-deep@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/clone-deep/-/clone-deep-4.0.1.tgz#c19fd9bdbbf85942b4fd979c84dcf7d5f07c2387" + integrity sha512-neHB9xuzh/wk0dIHweyAXv2aPGZIVk3pLMe+/RNzINf17fe0OG96QroktYAUm7SM1PBnzTabaLboqqxDyMU+SQ== + dependencies: + is-plain-object "^2.0.4" + kind-of "^6.0.2" + shallow-clone "^3.0.0" + +clsx@^1.0.4, clsx@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/clsx/-/clsx-1.1.1.tgz#98b3134f9abbdf23b2663491ace13c5c03a73188" + integrity sha512-6/bPho624p3S2pMyvP5kKBPXnI3ufHLObBFCfgx+LkeR5lg2XYy2hqZqUf45ypD8COn2bhgGJSUE+l5dhNBieA== + +color-convert@^2.0.1: + version "2.0.1" + resolved "https://registry.yarnpkg.com/color-convert/-/color-convert-2.0.1.tgz#72d3a68d598c9bdb3af2ad1e84f21d896abd4de3" + integrity sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ== + dependencies: + color-name "~1.1.4" + +color-name@~1.1.4: + version "1.1.4" + resolved "https://registry.yarnpkg.com/color-name/-/color-name-1.1.4.tgz#c2a09a87acbde69543de6f63fa3995c826c536a2" + integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA== + +colorette@^2.0.10, colorette@^2.0.14: + version "2.0.16" + resolved "https://registry.yarnpkg.com/colorette/-/colorette-2.0.16.tgz#713b9af84fdb000139f04546bd4a93f62a5085da" + integrity sha512-hUewv7oMjCp+wkBv5Rm0v87eJhq4woh5rSR+42YSQJKecCqgIqNkZ6lAlQms/BwHPJA5NKMRlpxPRv0n8HQW6g== + +commander@^2.20.0: + version "2.20.3" + resolved 
"https://registry.yarnpkg.com/commander/-/commander-2.20.3.tgz#fd485e84c03eb4881c20722ba48035e8531aeb33" + integrity sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ== + +commander@^7.0.0: + version "7.2.0" + resolved "https://registry.yarnpkg.com/commander/-/commander-7.2.0.tgz#a36cb57d0b501ce108e4d20559a150a391d97ab7" + integrity sha512-QrWXB+ZQSVPmIWIhtEO9H+gwHaMGYiF5ChvoJ+K9ZGHG/sVsa6yiesAD1GC/x46sET00Xlwo1u49RVVVzvcSkw== + +commander@^8.3.0: + version "8.3.0" + resolved "https://registry.yarnpkg.com/commander/-/commander-8.3.0.tgz#4837ea1b2da67b9c616a67afbb0fafee567bca66" + integrity sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww== + +compressible@~2.0.16: + version "2.0.18" + resolved "https://registry.yarnpkg.com/compressible/-/compressible-2.0.18.tgz#af53cca6b070d4c3c0750fbd77286a6d7cc46fba" + integrity sha512-AF3r7P5dWxL8MxyITRMlORQNaOA2IkAFaTr4k7BUumjPtRpGDTZpl0Pb1XCO6JeDCBdp126Cgs9sMxqSjgYyRg== + dependencies: + mime-db ">= 1.43.0 < 2" + +compression@^1.7.4: + version "1.7.4" + resolved "https://registry.yarnpkg.com/compression/-/compression-1.7.4.tgz#95523eff170ca57c29a0ca41e6fe131f41e5bb8f" + integrity sha512-jaSIDzP9pZVS4ZfQ+TzvtiWhdpFhE2RDHz8QJkpX9SIpLq88VueF5jJw6t+6CUQcAoA6t+x89MLrWAqpfDE8iQ== + dependencies: + accepts "~1.3.5" + bytes "3.0.0" + compressible "~2.0.16" + debug "2.6.9" + on-headers "~1.0.2" + safe-buffer "5.1.2" + vary "~1.1.2" + +compute-scroll-into-view@^1.0.17: + version "1.0.17" + resolved "https://registry.yarnpkg.com/compute-scroll-into-view/-/compute-scroll-into-view-1.0.17.tgz#6a88f18acd9d42e9cf4baa6bec7e0522607ab7ab" + integrity sha512-j4dx+Fb0URmzbwwMUrhqWM2BEWHdFGx+qZ9qqASHRPqvTYdqvWnHg0H1hIbcyLnvgnoNAVMlwkepyqM3DaIFUg== + +concat-map@0.0.1: + version "0.0.1" + resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b" + integrity sha1-2Klr13/Wjfd5OnMDajug1UBdR3s= + +connect-history-api-fallback@^1.6.0: + version "1.6.0" + resolved "https://registry.yarnpkg.com/connect-history-api-fallback/-/connect-history-api-fallback-1.6.0.tgz#8b32089359308d111115d81cad3fceab888f97bc" + integrity sha512-e54B99q/OUoH64zYYRf3HBP5z24G38h5D3qXu23JGRoigpX5Ss4r9ZnDk3g0Z8uQC2x2lPaJ+UlWBc1ZWBWdLg== + +content-disposition@0.5.4: + version "0.5.4" + resolved "https://registry.yarnpkg.com/content-disposition/-/content-disposition-0.5.4.tgz#8b82b4efac82512a02bb0b1dcec9d2c5e8eb5bfe" + integrity sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ== + dependencies: + safe-buffer "5.2.1" + +content-type@~1.0.4: + version "1.0.4" + resolved "https://registry.yarnpkg.com/content-type/-/content-type-1.0.4.tgz#e138cc75e040c727b1966fe5e5f8c9aee256fe3b" + integrity sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA== + +cookie-signature@1.0.6: + version "1.0.6" + resolved "https://registry.yarnpkg.com/cookie-signature/-/cookie-signature-1.0.6.tgz#e303a882b342cc3ee8ca513a79999734dab3ae2c" + integrity sha1-4wOogrNCzD7oylE6eZmXNNqzriw= + +cookie@0.4.2: + version "0.4.2" + resolved "https://registry.yarnpkg.com/cookie/-/cookie-0.4.2.tgz#0e41f24de5ecf317947c82fc789e06a884824432" + integrity sha512-aSWTXFzaKWkvHO1Ny/s+ePFpvKsPnjc551iI41v3ny/ow6tBG5Vd+FuqGNhh1LxOmVzOlGUriIlOaokOvhaStA== + +copy-to-clipboard@^3.2.0: + version "3.3.1" + resolved "https://registry.yarnpkg.com/copy-to-clipboard/-/copy-to-clipboard-3.3.1.tgz#115aa1a9998ffab6196f93076ad6da3b913662ae" 
+ integrity sha512-i13qo6kIHTTpCm8/Wup+0b1mVWETvu2kIMzKoK8FpkLkFxlt0znUAHcMzox+T8sPlqtZXq3CulEjQHsYiGFJUw== + dependencies: + toggle-selection "^1.0.6" + +core-util-is@~1.0.0: + version "1.0.3" + resolved "https://registry.yarnpkg.com/core-util-is/-/core-util-is-1.0.3.tgz#a6042d3634c2b27e9328f837b965fac83808db85" + integrity sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ== + +cross-env@^7.0.2: + version "7.0.3" + resolved "https://registry.yarnpkg.com/cross-env/-/cross-env-7.0.3.tgz#865264b29677dc015ba8418918965dd232fc54cf" + integrity sha512-+/HKd6EgcQCJGh2PSjZuUitQBQynKor4wrFbRg4DtAgS1aWO+gU52xpH7M9ScGgXSYmAVS9bIJ8EzuaGw0oNAw== + dependencies: + cross-spawn "^7.0.1" + +cross-spawn@^7.0.1, cross-spawn@^7.0.3: + version "7.0.3" + resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6" + integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w== + dependencies: + path-key "^3.1.0" + shebang-command "^2.0.0" + which "^2.0.1" + +css-loader@^5.2.4: + version "5.2.7" + resolved "https://registry.yarnpkg.com/css-loader/-/css-loader-5.2.7.tgz#9b9f111edf6fb2be5dc62525644cbc9c232064ae" + integrity sha512-Q7mOvpBNBG7YrVGMxRxcBJZFL75o+cH2abNASdibkj/fffYD8qWbInZrD0S9ccI6vZclF3DsHE7njGlLtaHbhg== + dependencies: + icss-utils "^5.1.0" + loader-utils "^2.0.0" + postcss "^8.2.15" + postcss-modules-extract-imports "^3.0.0" + postcss-modules-local-by-default "^4.0.0" + postcss-modules-scope "^3.0.0" + postcss-modules-values "^4.0.0" + postcss-value-parser "^4.1.0" + schema-utils "^3.0.0" + semver "^7.3.5" + +css-select@^4.1.3: + version "4.2.1" + resolved "https://registry.yarnpkg.com/css-select/-/css-select-4.2.1.tgz#9e665d6ae4c7f9d65dbe69d0316e3221fb274cdd" + integrity sha512-/aUslKhzkTNCQUB2qTX84lVmfia9NyjP3WpDGtj/WxhwBzWBYUV3DgUpurHTme8UTPcPlAD1DJ+b0nN/t50zDQ== + dependencies: + boolbase "^1.0.0" + css-what "^5.1.0" + domhandler "^4.3.0" + domutils "^2.8.0" + nth-check "^2.0.1" + +css-vendor@^2.0.8: + version "2.0.8" + resolved "https://registry.yarnpkg.com/css-vendor/-/css-vendor-2.0.8.tgz#e47f91d3bd3117d49180a3c935e62e3d9f7f449d" + integrity sha512-x9Aq0XTInxrkuFeHKbYC7zWY8ai7qJ04Kxd9MnvbC1uO5DagxoHQjm4JvG+vCdXOoFtCjbL2XSZfxmoYa9uQVQ== + dependencies: + "@babel/runtime" "^7.8.3" + is-in-browser "^1.0.2" + +css-what@^5.1.0: + version "5.1.0" + resolved "https://registry.yarnpkg.com/css-what/-/css-what-5.1.0.tgz#3f7b707aadf633baf62c2ceb8579b545bb40f7fe" + integrity sha512-arSMRWIIFY0hV8pIxZMEfmMI47Wj3R/aWpZDDxWYCPEiOMv6tfOrnpDtgxBYPEQD4V0Y/958+1TdC3iWTFcUPw== + +cssesc@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/cssesc/-/cssesc-3.0.0.tgz#37741919903b868565e1c09ea747445cd18983ee" + integrity sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg== + +csstype@^2.5.2: + version "2.6.20" + resolved "https://registry.yarnpkg.com/csstype/-/csstype-2.6.20.tgz#9229c65ea0b260cf4d3d997cb06288e36a8d6dda" + integrity sha512-/WwNkdXfckNgw6S5R125rrW8ez139lBHWouiBvX8dfMFtcn6V81REDqnH7+CRpRipfYlyU1CmOnOxrmGcFOjeA== + +csstype@^3.0.2: + version "3.0.11" + resolved "https://registry.yarnpkg.com/csstype/-/csstype-3.0.11.tgz#d66700c5eacfac1940deb4e3ee5642792d85cd33" + integrity sha512-sa6P2wJ+CAbgyy4KFssIb/JNMLxFvKF1pCYCSXS8ZMuqZnMsrxqI2E5sPyoTpxoPU/gVZMzr2zjOfg8GIZOMsw== + +date-fns@2.x: + version "2.28.0" + resolved 
"https://registry.yarnpkg.com/date-fns/-/date-fns-2.28.0.tgz#9570d656f5fc13143e50c975a3b6bbeb46cd08b2" + integrity sha512-8d35hViGYx/QH0icHYCeLmsLmMUheMmTyV9Fcm6gvNwdw31yXXH+O85sOBJ+OLnLQMKZowvpKb6FgMIQjcpvQw== + +dayjs@1.x: + version "1.10.8" + resolved "https://registry.yarnpkg.com/dayjs/-/dayjs-1.10.8.tgz#267df4bc6276fcb33c04a6735287e3f429abec41" + integrity sha512-wbNwDfBHHur9UOzNUjeKUOJ0fCb0a52Wx0xInmQ7Y8FstyajiV1NmK1e00cxsr9YrE9r7yAChE0VvpuY5Rnlow== + +debug@2.6.9: + version "2.6.9" + resolved "https://registry.yarnpkg.com/debug/-/debug-2.6.9.tgz#5d128515df134ff327e90a4c93f4e077a536341f" + integrity sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA== + dependencies: + ms "2.0.0" + +debug@^3.1.1: + version "3.2.7" + resolved "https://registry.yarnpkg.com/debug/-/debug-3.2.7.tgz#72580b7e9145fb39b6676f9c5e5fb100b934179a" + integrity sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ== + dependencies: + ms "^2.1.1" + +debug@^4.1.0: + version "4.3.3" + resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.3.tgz#04266e0b70a98d4462e6e288e38259213332b664" + integrity sha512-/zxw5+vh1Tfv+4Qn7a5nsbcJKPaSvCDhojn6FEl9vupwK2VCSDtEiEtqr8DFtzYFOdz63LBkxec7DYuc2jon6Q== + dependencies: + ms "2.1.2" + +deep-equal@^1.0.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/deep-equal/-/deep-equal-1.1.1.tgz#b5c98c942ceffaf7cb051e24e1434a25a2e6076a" + integrity sha512-yd9c5AdiqVcR+JjcwUQb9DkhJc8ngNr0MahEBGvDiJw8puWab2yZlh+nkasOnZP+EGTAP6rRp2JzJhJZzvNF8g== + dependencies: + is-arguments "^1.0.4" + is-date-object "^1.0.1" + is-regex "^1.0.4" + object-is "^1.0.1" + object-keys "^1.1.1" + regexp.prototype.flags "^1.2.0" + +default-gateway@^6.0.3: + version "6.0.3" + resolved "https://registry.yarnpkg.com/default-gateway/-/default-gateway-6.0.3.tgz#819494c888053bdb743edbf343d6cdf7f2943a71" + integrity sha512-fwSOJsbbNzZ/CUFpqFBqYfYNLj1NbMPm8MMCIzHjC83iSJRBEGmDUxU+WP661BaBQImeC2yHwXtz+P/O9o+XEg== + dependencies: + execa "^5.0.0" + +define-lazy-prop@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/define-lazy-prop/-/define-lazy-prop-2.0.0.tgz#3f7ae421129bcaaac9bc74905c98a0009ec9ee7f" + integrity sha512-Ds09qNh8yw3khSjiJjiUInaGX9xlqZDY7JVryGxdxV7NPeuqQfplOpQ66yJFZut3jLa5zOwkXw1g9EI2uKh4Og== + +define-properties@^1.1.3: + version "1.1.3" + resolved "https://registry.yarnpkg.com/define-properties/-/define-properties-1.1.3.tgz#cf88da6cbee26fe6db7094f61d870cbd84cee9f1" + integrity sha512-3MqfYKj2lLzdMSf8ZIZE/V+Zuy+BgD6f164e8K2w7dgnpKArBDerGYpM46IYYcjnkdPNMjPk9A6VFB8+3SKlXQ== + dependencies: + object-keys "^1.0.12" + +del@^6.0.0: + version "6.0.0" + resolved "https://registry.yarnpkg.com/del/-/del-6.0.0.tgz#0b40d0332cea743f1614f818be4feb717714c952" + integrity sha512-1shh9DQ23L16oXSZKB2JxpL7iMy2E0S9d517ptA1P8iw0alkPtQcrKH7ru31rYtKwF499HkTu+DRzq3TCKDFRQ== + dependencies: + globby "^11.0.1" + graceful-fs "^4.2.4" + is-glob "^4.0.1" + is-path-cwd "^2.2.0" + is-path-inside "^3.0.2" + p-map "^4.0.0" + rimraf "^3.0.2" + slash "^3.0.0" + +depd@~1.1.2: + version "1.1.2" + resolved "https://registry.yarnpkg.com/depd/-/depd-1.1.2.tgz#9bcd52e14c097763e749b274c4346ed2e560b5a9" + integrity sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak= + +destroy@~1.0.4: + version "1.0.4" + resolved "https://registry.yarnpkg.com/destroy/-/destroy-1.0.4.tgz#978857442c44749e4206613e37946205826abd80" + integrity sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA= + +detect-node@^2.0.4: + version "2.1.0" + resolved 
"https://registry.yarnpkg.com/detect-node/-/detect-node-2.1.0.tgz#c9c70775a49c3d03bc2c06d9a73be550f978f8b1" + integrity sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g== + +dir-glob@^3.0.1: + version "3.0.1" + resolved "https://registry.yarnpkg.com/dir-glob/-/dir-glob-3.0.1.tgz#56dbf73d992a4a93ba1584f4534063fd2e41717f" + integrity sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA== + dependencies: + path-type "^4.0.0" + +dns-equal@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/dns-equal/-/dns-equal-1.0.0.tgz#b39e7f1da6eb0a75ba9c17324b34753c47e0654d" + integrity sha1-s55/HabrCnW6nBcySzR1PEfgZU0= + +dns-packet@^1.3.1: + version "1.3.4" + resolved "https://registry.yarnpkg.com/dns-packet/-/dns-packet-1.3.4.tgz#e3455065824a2507ba886c55a89963bb107dec6f" + integrity sha512-BQ6F4vycLXBvdrJZ6S3gZewt6rcrks9KBgM9vrhW+knGRqc8uEdT7fuCwloc7nny5xNoMJ17HGH0R/6fpo8ECA== + dependencies: + ip "^1.1.0" + safe-buffer "^5.0.1" + +dns-txt@^2.0.2: + version "2.0.2" + resolved "https://registry.yarnpkg.com/dns-txt/-/dns-txt-2.0.2.tgz#b91d806f5d27188e4ab3e7d107d881a1cc4642b6" + integrity sha1-uR2Ab10nGI5Ks+fRB9iBocxGQrY= + dependencies: + buffer-indexof "^1.0.0" + +dom-align@^1.7.0: + version "1.12.2" + resolved "https://registry.yarnpkg.com/dom-align/-/dom-align-1.12.2.tgz#0f8164ebd0c9c21b0c790310493cd855892acd4b" + integrity sha512-pHuazgqrsTFrGU2WLDdXxCFabkdQDx72ddkraZNih1KsMcN5qsRSTR9O4VJRlwTPCPb5COYg3LOfiMHHcPInHg== + +dom-converter@^0.2.0: + version "0.2.0" + resolved "https://registry.yarnpkg.com/dom-converter/-/dom-converter-0.2.0.tgz#6721a9daee2e293682955b6afe416771627bb768" + integrity sha512-gd3ypIPfOMr9h5jIKq8E3sHOTCjeirnl0WK5ZdS1AW0Odt0b1PaWaHdJ4Qk4klv+YB9aJBS7mESXjFoDQPu6DA== + dependencies: + utila "~0.4" + +dom-helpers@^5.0.1: + version "5.2.1" + resolved "https://registry.yarnpkg.com/dom-helpers/-/dom-helpers-5.2.1.tgz#d9400536b2bf8225ad98fe052e029451ac40e902" + integrity sha512-nRCa7CK3VTrM2NmGkIy4cbK7IZlgBE/PYMn55rrXefr5xXDP0LdtfPnblFDoVdcAfslJ7or6iqAUnx0CCGIWQA== + dependencies: + "@babel/runtime" "^7.8.7" + csstype "^3.0.2" + +dom-serializer@^1.0.1: + version "1.3.2" + resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-1.3.2.tgz#6206437d32ceefaec7161803230c7a20bc1b4d91" + integrity sha512-5c54Bk5Dw4qAxNOI1pFEizPSjVsx5+bpJKmL2kPn8JhBUq2q09tTCa3mjijun2NfK78NMouDYNMBkOrPZiS+ig== + dependencies: + domelementtype "^2.0.1" + domhandler "^4.2.0" + entities "^2.0.0" + +domelementtype@^2.0.1, domelementtype@^2.2.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.2.0.tgz#9a0b6c2782ed6a1c7323d42267183df9bd8b1d57" + integrity sha512-DtBMo82pv1dFtUmHyr48beiuq792Sxohr+8Hm9zoxklYPfa6n0Z3Byjj2IV7bmr2IyqClnqEQhfgHJJ5QF0R5A== + +domhandler@^4.0.0, domhandler@^4.2.0, domhandler@^4.3.0: + version "4.3.0" + resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-4.3.0.tgz#16c658c626cf966967e306f966b431f77d4a5626" + integrity sha512-fC0aXNQXqKSFTr2wDNZDhsEYjCiYsDWl3D01kwt25hm1YIPyDGHvvi3rw+PLqHAl/m71MaiF7d5zvBr0p5UB2g== + dependencies: + domelementtype "^2.2.0" + +domutils@^2.5.2, domutils@^2.8.0: + version "2.8.0" + resolved "https://registry.yarnpkg.com/domutils/-/domutils-2.8.0.tgz#4437def5db6e2d1f5d6ee859bd95ca7d02048135" + integrity sha512-w96Cjofp72M5IIhpjgobBimYEfoPjx1Vx0BSX9P30WBdZW2WIKU0T1Bd0kz2eNZ9ikjKgHbEyKx8BB6H1L3h3A== + dependencies: + dom-serializer "^1.0.1" + domelementtype "^2.2.0" + domhandler "^4.2.0" + 
+dot-case@^3.0.4: + version "3.0.4" + resolved "https://registry.yarnpkg.com/dot-case/-/dot-case-3.0.4.tgz#9b2b670d00a431667a8a75ba29cd1b98809ce751" + integrity sha512-Kv5nKlh6yRrdrGvxeJ2e5y2eRUpkUosIW4A2AS38zwSz27zu7ufDwQPi5Jhs3XAlGNetl3bmnGhQsMtkKJnj3w== + dependencies: + no-case "^3.0.4" + tslib "^2.0.3" + +ee-first@1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/ee-first/-/ee-first-1.1.1.tgz#590c61156b0ae2f4f0255732a158b266bc56b21d" + integrity sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0= + +electron-to-chromium@^1.4.76: + version "1.4.76" + resolved "https://registry.yarnpkg.com/electron-to-chromium/-/electron-to-chromium-1.4.76.tgz#a0494baedaf51094b1c172999919becd9975a934" + integrity sha512-3Vftv7cenJtQb+k00McEBZ2vVmZ/x+HEF7pcZONZIkOsESqAqVuACmBxMv0JhzX7u0YltU0vSqRqgBSTAhFUjA== + +emojis-list@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/emojis-list/-/emojis-list-3.0.0.tgz#5570662046ad29e2e916e71aae260abdff4f6a78" + integrity sha512-/kyM18EfinwXZbno9FyUGeFh87KC8HRQBQGildHZbEuRyWFOmv1U10o9BBp8XVZDVNNuQKyIGIu5ZYAAXJ0V2Q== + +encodeurl@~1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/encodeurl/-/encodeurl-1.0.2.tgz#ad3ff4c86ec2d029322f5a02c3a9a606c95b3f59" + integrity sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k= + +enhanced-resolve@^4.0.0: + version "4.5.0" + resolved "https://registry.yarnpkg.com/enhanced-resolve/-/enhanced-resolve-4.5.0.tgz#2f3cfd84dbe3b487f18f2db2ef1e064a571ca5ec" + integrity sha512-Nv9m36S/vxpsI+Hc4/ZGRs0n9mXqSWGGq49zxb/cJfPAQMbUtttJAlNPS4AQzaBdw/pKskw5bMbekT/Y7W/Wlg== + dependencies: + graceful-fs "^4.1.2" + memory-fs "^0.5.0" + tapable "^1.0.0" + +enhanced-resolve@^5.9.2: + version "5.9.2" + resolved "https://registry.yarnpkg.com/enhanced-resolve/-/enhanced-resolve-5.9.2.tgz#0224dcd6a43389ebfb2d55efee517e5466772dd9" + integrity sha512-GIm3fQfwLJ8YZx2smuHpBKkXC1yOk+OBEmKckVyL0i/ea8mqDEykK3ld5dgH1QYPNyT/lIllxV2LULnxCHaHkA== + dependencies: + graceful-fs "^4.2.4" + tapable "^2.2.0" + +entities@^2.0.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/entities/-/entities-2.2.0.tgz#098dc90ebb83d8dffa089d55256b351d34c4da55" + integrity sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A== + +envinfo@^7.7.3: + version "7.8.1" + resolved "https://registry.yarnpkg.com/envinfo/-/envinfo-7.8.1.tgz#06377e3e5f4d379fea7ac592d5ad8927e0c4d475" + integrity sha512-/o+BXHmB7ocbHEAs6F2EnG0ogybVVUdkRunTT2glZU9XAaGmhqskrvKwqXuDfNjEO0LZKWdejEEpnq8aM0tOaw== + +errno@^0.1.3: + version "0.1.8" + resolved "https://registry.yarnpkg.com/errno/-/errno-0.1.8.tgz#8bb3e9c7d463be4976ff888f76b4809ebc2e811f" + integrity sha512-dJ6oBr5SQ1VSd9qkk7ByRgb/1SH4JZjCHSW/mr63/QcXO9zLVxvJ6Oy13nio03rxpSnVDDjFor75SjVeZWPW/A== + dependencies: + prr "~1.0.1" + +es-module-lexer@^0.9.0: + version "0.9.3" + resolved "https://registry.yarnpkg.com/es-module-lexer/-/es-module-lexer-0.9.3.tgz#6f13db00cc38417137daf74366f535c8eb438f19" + integrity sha512-1HQ2M2sPtxwnvOvT1ZClHyQDiggdNjURWpY2we6aMKCQiUVxTmVs2UYPLIrD84sS+kMdUwfBSylbJPwNnBrnHQ== + +escalade@^3.1.1: + version "3.1.1" + resolved "https://registry.yarnpkg.com/escalade/-/escalade-3.1.1.tgz#d8cfdc7000965c5a0174b4a82eaa5c0552742e40" + integrity sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw== + +escape-html@~1.0.3: + version "1.0.3" + resolved "https://registry.yarnpkg.com/escape-html/-/escape-html-1.0.3.tgz#0258eae4d3d0c0974de1c169188ef0051d1d1988" + integrity sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg= + +eslint-scope@5.1.1: + version 
"5.1.1" + resolved "https://registry.yarnpkg.com/eslint-scope/-/eslint-scope-5.1.1.tgz#e786e59a66cb92b3f6c1fb0d508aab174848f48c" + integrity sha512-2NxwbF/hZ0KpepYN0cNbo+FN6XoK7GaHlQhgx/hIZl6Va0bF45RQOOwhLIy8lQDbuCiadSLCBnH2CFYquit5bw== + dependencies: + esrecurse "^4.3.0" + estraverse "^4.1.1" + +esrecurse@^4.3.0: + version "4.3.0" + resolved "https://registry.yarnpkg.com/esrecurse/-/esrecurse-4.3.0.tgz#7ad7964d679abb28bee72cec63758b1c5d2c9921" + integrity sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag== + dependencies: + estraverse "^5.2.0" + +estraverse@^4.1.1: + version "4.3.0" + resolved "https://registry.yarnpkg.com/estraverse/-/estraverse-4.3.0.tgz#398ad3f3c5a24948be7725e83d11a7de28cdbd1d" + integrity sha512-39nnKffWz8xN1BU/2c79n9nB9HDzo0niYUqx6xyqUnyoAnQyyWpOTdZEeiCch8BBu515t4wp9ZmgVfVhn9EBpw== + +estraverse@^5.2.0: + version "5.3.0" + resolved "https://registry.yarnpkg.com/estraverse/-/estraverse-5.3.0.tgz#2eea5290702f26ab8fe5370370ff86c965d21123" + integrity sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA== + +etag@~1.8.1: + version "1.8.1" + resolved "https://registry.yarnpkg.com/etag/-/etag-1.8.1.tgz#41ae2eeb65efa62268aebfea83ac7d79299b0887" + integrity sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc= + +eventemitter3@^4.0.0: + version "4.0.7" + resolved "https://registry.yarnpkg.com/eventemitter3/-/eventemitter3-4.0.7.tgz#2de9b68f6528d5644ef5c59526a1b4a07306169f" + integrity sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw== + +events@^3.2.0: + version "3.3.0" + resolved "https://registry.yarnpkg.com/events/-/events-3.3.0.tgz#31a95ad0a924e2d2c419a813aeb2c4e878ea7400" + integrity sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q== + +execa@^5.0.0: + version "5.1.1" + resolved "https://registry.yarnpkg.com/execa/-/execa-5.1.1.tgz#f80ad9cbf4298f7bd1d4c9555c21e93741c411dd" + integrity sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg== + dependencies: + cross-spawn "^7.0.3" + get-stream "^6.0.0" + human-signals "^2.1.0" + is-stream "^2.0.0" + merge-stream "^2.0.0" + npm-run-path "^4.0.1" + onetime "^5.1.2" + signal-exit "^3.0.3" + strip-final-newline "^2.0.0" + +express@^4.17.1: + version "4.17.3" + resolved "https://registry.yarnpkg.com/express/-/express-4.17.3.tgz#f6c7302194a4fb54271b73a1fe7a06478c8f85a1" + integrity sha512-yuSQpz5I+Ch7gFrPCk4/c+dIBKlQUxtgwqzph132bsT6qhuzss6I8cLJQz7B3rFblzd6wtcI0ZbGltH/C4LjUg== + dependencies: + accepts "~1.3.8" + array-flatten "1.1.1" + body-parser "1.19.2" + content-disposition "0.5.4" + content-type "~1.0.4" + cookie "0.4.2" + cookie-signature "1.0.6" + debug "2.6.9" + depd "~1.1.2" + encodeurl "~1.0.2" + escape-html "~1.0.3" + etag "~1.8.1" + finalhandler "~1.1.2" + fresh "0.5.2" + merge-descriptors "1.0.1" + methods "~1.1.2" + on-finished "~2.3.0" + parseurl "~1.3.3" + path-to-regexp "0.1.7" + proxy-addr "~2.0.7" + qs "6.9.7" + range-parser "~1.2.1" + safe-buffer "5.2.1" + send "0.17.2" + serve-static "1.14.2" + setprototypeof "1.2.0" + statuses "~1.5.0" + type-is "~1.6.18" + utils-merge "1.0.1" + vary "~1.1.2" + +fast-deep-equal@^3.1.1, fast-deep-equal@^3.1.3: + version "3.1.3" + resolved "https://registry.yarnpkg.com/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz#3a7d56b559d6cbc3eb512325244e619a65c6c525" + integrity sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q== + +fast-glob@^3.2.9: + 
version "3.2.11" + resolved "https://registry.yarnpkg.com/fast-glob/-/fast-glob-3.2.11.tgz#a1172ad95ceb8a16e20caa5c5e56480e5129c1d9" + integrity sha512-xrO3+1bxSo3ZVHAnqzyuewYT6aMFHRAd4Kcs92MAonjwQZLsK9d0SF1IyQ3k5PoirxTW0Oe/RqFgMQ6TcNE5Ew== + dependencies: + "@nodelib/fs.stat" "^2.0.2" + "@nodelib/fs.walk" "^1.2.3" + glob-parent "^5.1.2" + merge2 "^1.3.0" + micromatch "^4.0.4" + +fast-json-stable-stringify@^2.0.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz#874bf69c6f404c2b5d99c481341399fd55892633" + integrity sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw== + +fastest-levenshtein@^1.0.12: + version "1.0.12" + resolved "https://registry.yarnpkg.com/fastest-levenshtein/-/fastest-levenshtein-1.0.12.tgz#9990f7d3a88cc5a9ffd1f1745745251700d497e2" + integrity sha512-On2N+BpYJ15xIC974QNVuYGMOlEVt4s0EOI3wwMqOmK1fdDY+FN/zltPV8vosq4ad4c/gJ1KHScUn/6AWIgiow== + +fastq@^1.6.0: + version "1.13.0" + resolved "https://registry.yarnpkg.com/fastq/-/fastq-1.13.0.tgz#616760f88a7526bdfc596b7cab8c18938c36b98c" + integrity sha512-YpkpUnK8od0o1hmeSc7UUs/eB/vIPWJYjKck2QKIzAf71Vm1AAQ3EbuZB3g2JIy+pg+ERD0vqI79KyZiB2e2Nw== + dependencies: + reusify "^1.0.4" + +faye-websocket@^0.11.3: + version "0.11.4" + resolved "https://registry.yarnpkg.com/faye-websocket/-/faye-websocket-0.11.4.tgz#7f0d9275cfdd86a1c963dc8b65fcc451edcbb1da" + integrity sha512-CzbClwlXAuiRQAlUyfqPgvPoNKTckTPGfwZV4ZdAhVcP2lh9KUxJg2b5GkE7XbjKQ3YJnQ9z6D9ntLAlB+tP8g== + dependencies: + websocket-driver ">=0.5.1" + +fill-range@^7.0.1: + version "7.0.1" + resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40" + integrity sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ== + dependencies: + to-regex-range "^5.0.1" + +finalhandler@~1.1.2: + version "1.1.2" + resolved "https://registry.yarnpkg.com/finalhandler/-/finalhandler-1.1.2.tgz#b7e7d000ffd11938d0fdb053506f6ebabe9f587d" + integrity sha512-aAWcW57uxVNrQZqFXjITpW3sIUQmHGG3qSb9mUah9MgMC4NeWhNOlNjXEYq3HjRAvL6arUviZGGJsBg6z0zsWA== + dependencies: + debug "2.6.9" + encodeurl "~1.0.2" + escape-html "~1.0.3" + on-finished "~2.3.0" + parseurl "~1.3.3" + statuses "~1.5.0" + unpipe "~1.0.0" + +find-up@^4.0.0: + version "4.1.0" + resolved "https://registry.yarnpkg.com/find-up/-/find-up-4.1.0.tgz#97afe7d6cdc0bc5928584b7c8d7b16e8a9aa5d19" + integrity sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw== + dependencies: + locate-path "^5.0.0" + path-exists "^4.0.0" + +flow-bin@^0.118.0: + version "0.118.0" + resolved "https://registry.yarnpkg.com/flow-bin/-/flow-bin-0.118.0.tgz#fb706364a58c682d67a2ca7df39396467dc397d1" + integrity sha512-jlbUu0XkbpXeXhan5xyTqVK1jmEKNxE8hpzznI3TThHTr76GiFwK0iRzhDo4KNy+S9h/KxHaqVhTP86vA6wHCg== + +follow-redirects@^1.0.0: + version "1.14.9" + resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.14.9.tgz#dd4ea157de7bfaf9ea9b3fbd85aa16951f78d8d7" + integrity sha512-MQDfihBQYMcyy5dhRDJUHcw7lb2Pv/TuE6xP1vyraLukNDHKbDxDNaOE3NbCAdKQApno+GPRyo1YAp89yCjK4w== + +forwarded@0.2.0: + version "0.2.0" + resolved "https://registry.yarnpkg.com/forwarded/-/forwarded-0.2.0.tgz#2269936428aad4c15c7ebe9779a84bf0b2a81811" + integrity sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow== + +fresh@0.5.2: + version "0.5.2" + resolved 
"https://registry.yarnpkg.com/fresh/-/fresh-0.5.2.tgz#3d8cadd90d976569fa835ab1f8e4b23a105605a7" + integrity sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac= + +fs-monkey@1.0.3: + version "1.0.3" + resolved "https://registry.yarnpkg.com/fs-monkey/-/fs-monkey-1.0.3.tgz#ae3ac92d53bb328efe0e9a1d9541f6ad8d48e2d3" + integrity sha512-cybjIfiiE+pTWicSCLFHSrXZ6EilF30oh91FDP9S2B051prEa7QWfrVTQm10/dDpswBDXZugPa1Ogu8Yh+HV0Q== + +fs.realpath@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f" + integrity sha1-FQStJSMVjKpA20onh8sBQRmU6k8= + +fsevents@~2.3.2: + version "2.3.2" + resolved "https://registry.yarnpkg.com/fsevents/-/fsevents-2.3.2.tgz#8a526f78b8fdf4623b709e0b975c52c24c02fd1a" + integrity sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA== + +function-bind@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/function-bind/-/function-bind-1.1.1.tgz#a56899d3ea3c9bab874bb9773b7c5ede92f4895d" + integrity sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A== + +get-intrinsic@^1.0.2: + version "1.1.1" + resolved "https://registry.yarnpkg.com/get-intrinsic/-/get-intrinsic-1.1.1.tgz#15f59f376f855c446963948f0d24cd3637b4abc6" + integrity sha512-kWZrnVM42QCiEA2Ig1bG8zjoIMOgxWwYCEeNdwY6Tv/cOSeGpcoX4pXHfKUxNKVoArnrEr2e9srnAxxGIraS9Q== + dependencies: + function-bind "^1.1.1" + has "^1.0.3" + has-symbols "^1.0.1" + +get-stream@^6.0.0: + version "6.0.1" + resolved "https://registry.yarnpkg.com/get-stream/-/get-stream-6.0.1.tgz#a262d8eef67aced57c2852ad6167526a43cbf7b7" + integrity sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg== + +glob-parent@^5.1.2, glob-parent@~5.1.2: + version "5.1.2" + resolved "https://registry.yarnpkg.com/glob-parent/-/glob-parent-5.1.2.tgz#869832c58034fe68a4093c17dc15e8340d8401c4" + integrity sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow== + dependencies: + is-glob "^4.0.1" + +glob-to-regexp@^0.4.1: + version "0.4.1" + resolved "https://registry.yarnpkg.com/glob-to-regexp/-/glob-to-regexp-0.4.1.tgz#c75297087c851b9a578bd217dd59a92f59fe546e" + integrity sha512-lkX1HJXwyMcprw/5YUZc2s7DrpAiHB21/V+E1rHUrVNokkvB6bqMzT0VfV6/86ZNabt1k14YOIaT7nDvOX3Iiw== + +glob@^7.1.3: + version "7.2.0" + resolved "https://registry.yarnpkg.com/glob/-/glob-7.2.0.tgz#d15535af7732e02e948f4c41628bd910293f6023" + integrity sha512-lmLf6gtyrPq8tTjSmrO94wBeQbFR3HbLHbuyD69wuyQkImp2hWqMGB47OX65FBkPffO641IP9jWa1z4ivqG26Q== + dependencies: + fs.realpath "^1.0.0" + inflight "^1.0.4" + inherits "2" + minimatch "^3.0.4" + once "^1.3.0" + path-is-absolute "^1.0.0" + +globby@^11.0.1: + version "11.1.0" + resolved "https://registry.yarnpkg.com/globby/-/globby-11.1.0.tgz#bd4be98bb042f83d796f7e3811991fbe82a0d34b" + integrity sha512-jhIXaOzy1sb8IyocaruWSn1TjmnBVs8Ayhcy83rmxNJ8q2uWKCAj3CnJY+KpGSXCueAPc0i05kVvVKtP1t9S3g== + dependencies: + array-union "^2.1.0" + dir-glob "^3.0.1" + fast-glob "^3.2.9" + ignore "^5.2.0" + merge2 "^1.4.1" + slash "^3.0.0" + +graceful-fs@^4.1.2, graceful-fs@^4.2.4, graceful-fs@^4.2.6, graceful-fs@^4.2.9: + version "4.2.9" + resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.9.tgz#041b05df45755e587a24942279b9d113146e1c96" + integrity sha512-NtNxqUcXgpW2iMrfqSfR73Glt39K+BLwWsPs94yR63v45T0Wbej7eRmL5cWfwEgqXnmjQp3zaJTshdRW/qC2ZQ== + +handle-thing@^2.0.0: + version "2.0.1" + resolved 
"https://registry.yarnpkg.com/handle-thing/-/handle-thing-2.0.1.tgz#857f79ce359580c340d43081cc648970d0bb234e" + integrity sha512-9Qn4yBxelxoh2Ow62nP+Ka/kMnOXRi8BXnRaUwezLNhqelnN49xKz4F/dPP8OYLxLxq6JDtZb2i9XznUQbNPTg== + +has-flag@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/has-flag/-/has-flag-4.0.0.tgz#944771fd9c81c81265c4d6941860da06bb59479b" + integrity sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ== + +has-symbols@^1.0.1, has-symbols@^1.0.2: + version "1.0.3" + resolved "https://registry.yarnpkg.com/has-symbols/-/has-symbols-1.0.3.tgz#bb7b2c4349251dce87b125f7bdf874aa7c8b39f8" + integrity sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A== + +has-tostringtag@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/has-tostringtag/-/has-tostringtag-1.0.0.tgz#7e133818a7d394734f941e73c3d3f9291e658b25" + integrity sha512-kFjcSNhnlGV1kyoGk7OXKSawH5JOb/LzUc5w9B02hOTO0dfFRjbHQKvg1d6cf3HbeUmtU9VbbV3qzZ2Teh97WQ== + dependencies: + has-symbols "^1.0.2" + +has@^1.0.3: + version "1.0.3" + resolved "https://registry.yarnpkg.com/has/-/has-1.0.3.tgz#722d7cbfc1f6aa8241f16dd814e011e1f41e8796" + integrity sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw== + dependencies: + function-bind "^1.1.1" + +he@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/he/-/he-1.2.0.tgz#84ae65fa7eafb165fddb61566ae14baf05664f0f" + integrity sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw== + +hoist-non-react-statics@^3.3.2: + version "3.3.2" + resolved "https://registry.yarnpkg.com/hoist-non-react-statics/-/hoist-non-react-statics-3.3.2.tgz#ece0acaf71d62c2969c2ec59feff42a4b1a85b45" + integrity sha512-/gGivxi8JPKWNm/W0jSmzcMPpfpPLc3dY/6GxhX2hQ9iGj3aDfklV4ET7NjKpSinLpJ5vafa9iiGIEZg10SfBw== + dependencies: + react-is "^16.7.0" + +hpack.js@^2.1.6: + version "2.1.6" + resolved "https://registry.yarnpkg.com/hpack.js/-/hpack.js-2.1.6.tgz#87774c0949e513f42e84575b3c45681fade2a0b2" + integrity sha1-h3dMCUnlE/QuhFdbPEVoH63ioLI= + dependencies: + inherits "^2.0.1" + obuf "^1.0.0" + readable-stream "^2.0.1" + wbuf "^1.1.0" + +html-entities@^2.3.2: + version "2.3.2" + resolved "https://registry.yarnpkg.com/html-entities/-/html-entities-2.3.2.tgz#760b404685cb1d794e4f4b744332e3b00dcfe488" + integrity sha512-c3Ab/url5ksaT0WyleslpBEthOzWhrjQbg75y7XUsfSzi3Dgzt0l8w5e7DylRn15MTlMMD58dTfzddNS2kcAjQ== + +html-minifier-terser@^6.0.2: + version "6.1.0" + resolved "https://registry.yarnpkg.com/html-minifier-terser/-/html-minifier-terser-6.1.0.tgz#bfc818934cc07918f6b3669f5774ecdfd48f32ab" + integrity sha512-YXxSlJBZTP7RS3tWnQw74ooKa6L9b9i9QYXY21eUEvhZ3u9XLfv6OnFsQq6RxkhHygsaUMvYsZRV5rU/OVNZxw== + dependencies: + camel-case "^4.1.2" + clean-css "^5.2.2" + commander "^8.3.0" + he "^1.2.0" + param-case "^3.0.4" + relateurl "^0.2.7" + terser "^5.10.0" + +html-webpack-plugin@^5.3.1: + version "5.5.0" + resolved "https://registry.yarnpkg.com/html-webpack-plugin/-/html-webpack-plugin-5.5.0.tgz#c3911936f57681c1f9f4d8b68c158cd9dfe52f50" + integrity sha512-sy88PC2cRTVxvETRgUHFrL4No3UxvcH8G1NepGhqaTT+GXN2kTamqasot0inS5hXeg1cMbFDt27zzo9p35lZVw== + dependencies: + "@types/html-minifier-terser" "^6.0.0" + html-minifier-terser "^6.0.2" + lodash "^4.17.21" + pretty-error "^4.0.0" + tapable "^2.0.0" + +htmlparser2@^6.1.0: + version "6.1.0" + resolved 
"https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-6.1.0.tgz#c4d762b6c3371a05dbe65e94ae43a9f845fb8fb7" + integrity sha512-gyyPk6rgonLFEDGoeRgQNaEUvdJ4ktTmmUh/h2t7s+M8oPpIPxgNACWa+6ESR57kXstwqPiCut0V8NRpcwgU7A== + dependencies: + domelementtype "^2.0.1" + domhandler "^4.0.0" + domutils "^2.5.2" + entities "^2.0.0" + +http-deceiver@^1.2.7: + version "1.2.7" + resolved "https://registry.yarnpkg.com/http-deceiver/-/http-deceiver-1.2.7.tgz#fa7168944ab9a519d337cb0bec7284dc3e723d87" + integrity sha1-+nFolEq5pRnTN8sL7HKE3D5yPYc= + +http-errors@1.8.1: + version "1.8.1" + resolved "https://registry.yarnpkg.com/http-errors/-/http-errors-1.8.1.tgz#7c3f28577cbc8a207388455dbd62295ed07bd68c" + integrity sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g== + dependencies: + depd "~1.1.2" + inherits "2.0.4" + setprototypeof "1.2.0" + statuses ">= 1.5.0 < 2" + toidentifier "1.0.1" + +http-errors@~1.6.2: + version "1.6.3" + resolved "https://registry.yarnpkg.com/http-errors/-/http-errors-1.6.3.tgz#8b55680bb4be283a0b5bf4ea2e38580be1d9320d" + integrity sha1-i1VoC7S+KDoLW/TqLjhYC+HZMg0= + dependencies: + depd "~1.1.2" + inherits "2.0.3" + setprototypeof "1.1.0" + statuses ">= 1.4.0 < 2" + +http-parser-js@>=0.5.1: + version "0.5.6" + resolved "https://registry.yarnpkg.com/http-parser-js/-/http-parser-js-0.5.6.tgz#2e02406ab2df8af8a7abfba62e0da01c62b95afd" + integrity sha512-vDlkRPDJn93swjcjqMSaGSPABbIarsr1TLAui/gLDXzV5VsJNdXNzMYDyNBLQkjWQCJ1uizu8T2oDMhmGt0PRA== + +http-proxy-middleware@^2.0.0: + version "2.0.3" + resolved "https://registry.yarnpkg.com/http-proxy-middleware/-/http-proxy-middleware-2.0.3.tgz#5df04f69a89f530c2284cd71eeaa51ba52243289" + integrity sha512-1bloEwnrHMnCoO/Gcwbz7eSVvW50KPES01PecpagI+YLNLci4AcuKJrujW4Mc3sBLpFxMSlsLNHS5Nl/lvrTPA== + dependencies: + "@types/http-proxy" "^1.17.8" + http-proxy "^1.18.1" + is-glob "^4.0.1" + is-plain-obj "^3.0.0" + micromatch "^4.0.2" + +http-proxy@^1.18.1: + version "1.18.1" + resolved "https://registry.yarnpkg.com/http-proxy/-/http-proxy-1.18.1.tgz#401541f0534884bbf95260334e72f88ee3976549" + integrity sha512-7mz/721AbnJwIVbnaSv1Cz3Am0ZLT/UBwkC92VlxhXv/k/BBQfM2fXElQNC27BVGr0uwUpplYPQM9LnaBMR5NQ== + dependencies: + eventemitter3 "^4.0.0" + follow-redirects "^1.0.0" + requires-port "^1.0.0" + +human-signals@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0" + integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw== + +hyphenate-style-name@^1.0.3: + version "1.0.4" + resolved "https://registry.yarnpkg.com/hyphenate-style-name/-/hyphenate-style-name-1.0.4.tgz#691879af8e220aea5750e8827db4ef62a54e361d" + integrity sha512-ygGZLjmXfPHj+ZWh6LwbC37l43MhfztxetbFCoYTM2VjkIUpeHgSNn7QIyVFj7YQ1Wl9Cbw5sholVJPzWvC2MQ== + +iconv-lite@0.4.24: + version "0.4.24" + resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.4.24.tgz#2022b4b25fbddc21d2f524974a474aafe733908b" + integrity sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA== + dependencies: + safer-buffer ">= 2.1.2 < 3" + +icss-utils@^5.0.0, icss-utils@^5.1.0: + version "5.1.0" + resolved "https://registry.yarnpkg.com/icss-utils/-/icss-utils-5.1.0.tgz#c6be6858abd013d768e98366ae47e25d5887b1ae" + integrity sha512-soFhflCVWLfRNOPU3iv5Z9VUdT44xFRbzjLsEzSr5AQmgqPMTHdU3PMT1Cf1ssx8fLNJDA1juftYl+PUcv3MqA== + +ignore@^5.2.0: + version "5.2.0" + resolved 
"https://registry.yarnpkg.com/ignore/-/ignore-5.2.0.tgz#6d3bac8fa7fe0d45d9f9be7bac2fc279577e345a" + integrity sha512-CmxgYGiEPCLhfLnpPp1MoRmifwEIOgjcHXxOBjv7mY96c+eWScsOP9c112ZyLdWHi0FxHjI+4uVhKYp/gcdRmQ== + +import-local@^3.0.2: + version "3.1.0" + resolved "https://registry.yarnpkg.com/import-local/-/import-local-3.1.0.tgz#b4479df8a5fd44f6cdce24070675676063c95cb4" + integrity sha512-ASB07uLtnDs1o6EHjKpX34BKYDSqnFerfTOJL2HvMqF70LnxpjkzDB8J44oT9pu4AMPkQwf8jl6szgvNd2tRIg== + dependencies: + pkg-dir "^4.2.0" + resolve-cwd "^3.0.0" + +indent-string@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/indent-string/-/indent-string-4.0.0.tgz#624f8f4497d619b2d9768531d58f4122854d7251" + integrity sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg== + +inflight@^1.0.4: + version "1.0.6" + resolved "https://registry.yarnpkg.com/inflight/-/inflight-1.0.6.tgz#49bd6331d7d02d0c09bc910a1075ba8165b56df9" + integrity sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk= + dependencies: + once "^1.3.0" + wrappy "1" + +inherits@2, inherits@2.0.4, inherits@^2.0.1, inherits@^2.0.3, inherits@~2.0.3: + version "2.0.4" + resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c" + integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ== + +inherits@2.0.3: + version "2.0.3" + resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.3.tgz#633c2c83e3da42a502f52466022480f4208261de" + integrity sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4= + +inline-chunk-html-plugin@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/inline-chunk-html-plugin/-/inline-chunk-html-plugin-1.1.1.tgz#f64111aed16fac274d2b929f6a6a08671d82354e" + integrity sha512-6W1eGIj8z/Yla6xJx5il6jJfCxMZS3kVkbiLQThbbjdsDLRIWkUVmpnhfW2l6WAwCW+qfy0zoXVGBZM1E5XF3g== + +interpret@^2.2.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/interpret/-/interpret-2.2.0.tgz#1a78a0b5965c40a5416d007ad6f50ad27c417df9" + integrity sha512-Ju0Bz/cEia55xDwUWEa8+olFpCiQoypjnQySseKtmjNrnps3P+xfpUmGr90T7yjlVJmOtybRvPXhKMbHr+fWnw== + +ip@^1.1.0: + version "1.1.5" + resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.5.tgz#bdded70114290828c0a039e72ef25f5aaec4354a" + integrity sha1-vd7XARQpCCjAoDnnLvJfWq7ENUo= + +ipaddr.js@1.9.1: + version "1.9.1" + resolved "https://registry.yarnpkg.com/ipaddr.js/-/ipaddr.js-1.9.1.tgz#bff38543eeb8984825079ff3a2a8e6cbd46781b3" + integrity sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g== + +ipaddr.js@^2.0.1: + version "2.0.1" + resolved "https://registry.yarnpkg.com/ipaddr.js/-/ipaddr.js-2.0.1.tgz#eca256a7a877e917aeb368b0a7497ddf42ef81c0" + integrity sha512-1qTgH9NG+IIJ4yfKs2e6Pp1bZg8wbDbKHT21HrLIeYBTRLgMYKnMTPAuI3Lcs61nfx5h1xlXnbJtH1kX5/d/ng== + +is-arguments@^1.0.4: + version "1.1.1" + resolved "https://registry.yarnpkg.com/is-arguments/-/is-arguments-1.1.1.tgz#15b3f88fda01f2a97fec84ca761a560f123efa9b" + integrity sha512-8Q7EARjzEnKpt/PCD7e1cgUS0a6X8u5tdSiMqXhojOdoV9TsMsiO+9VLC5vAmO8N7/GmXn7yjR8qnA6bVAEzfA== + dependencies: + call-bind "^1.0.2" + has-tostringtag "^1.0.0" + +is-binary-path@~2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/is-binary-path/-/is-binary-path-2.1.0.tgz#ea1f7f3b80f064236e83470f86c09c254fb45b09" + integrity sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw== + dependencies: + binary-extensions "^2.0.0" + +is-core-module@^2.8.1: + version "2.8.1" + resolved 
"https://registry.yarnpkg.com/is-core-module/-/is-core-module-2.8.1.tgz#f59fdfca701d5879d0a6b100a40aa1560ce27211" + integrity sha512-SdNCUs284hr40hFTFP6l0IfZ/RSrMXF3qgoRHd3/79unUTvrFO/JoXwkGm+5J/Oe3E/b5GsnG330uUNgRpu1PA== + dependencies: + has "^1.0.3" + +is-date-object@^1.0.1: + version "1.0.5" + resolved "https://registry.yarnpkg.com/is-date-object/-/is-date-object-1.0.5.tgz#0841d5536e724c25597bf6ea62e1bd38298df31f" + integrity sha512-9YQaSxsAiSwcvS33MBk3wTCVnWK+HhF8VZR2jRxehM16QcVOdHqPn4VPHmRK4lSr38n9JriurInLcP90xsYNfQ== + dependencies: + has-tostringtag "^1.0.0" + +is-docker@^2.0.0, is-docker@^2.1.1: + version "2.2.1" + resolved "https://registry.yarnpkg.com/is-docker/-/is-docker-2.2.1.tgz#33eeabe23cfe86f14bde4408a02c0cfb853acdaa" + integrity sha512-F+i2BKsFrH66iaUFc0woD8sLy8getkwTwtOBjvs56Cx4CgJDeKQeqfz8wAYiSb8JOprWhHH5p77PbmYCvvUuXQ== + +is-extglob@^2.1.1: + version "2.1.1" + resolved "https://registry.yarnpkg.com/is-extglob/-/is-extglob-2.1.1.tgz#a88c02535791f02ed37c76a1b9ea9773c833f8c2" + integrity sha1-qIwCU1eR8C7TfHahueqXc8gz+MI= + +is-glob@^4.0.1, is-glob@~4.0.1: + version "4.0.3" + resolved "https://registry.yarnpkg.com/is-glob/-/is-glob-4.0.3.tgz#64f61e42cbbb2eec2071a9dac0b28ba1e65d5084" + integrity sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg== + dependencies: + is-extglob "^2.1.1" + +is-in-browser@^1.0.2, is-in-browser@^1.1.3: + version "1.1.3" + resolved "https://registry.yarnpkg.com/is-in-browser/-/is-in-browser-1.1.3.tgz#56ff4db683a078c6082eb95dad7dc62e1d04f835" + integrity sha1-Vv9NtoOgeMYILrldrX3GLh0E+DU= + +is-number@^7.0.0: + version "7.0.0" + resolved "https://registry.yarnpkg.com/is-number/-/is-number-7.0.0.tgz#7535345b896734d5f80c4d06c50955527a14f12b" + integrity sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng== + +is-path-cwd@^2.2.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/is-path-cwd/-/is-path-cwd-2.2.0.tgz#67d43b82664a7b5191fd9119127eb300048a9fdb" + integrity sha512-w942bTcih8fdJPJmQHFzkS76NEP8Kzzvmw92cXsazb8intwLqPibPPdXf4ANdKV3rYMuuQYGIWtvz9JilB3NFQ== + +is-path-inside@^3.0.2: + version "3.0.3" + resolved "https://registry.yarnpkg.com/is-path-inside/-/is-path-inside-3.0.3.tgz#d231362e53a07ff2b0e0ea7fed049161ffd16283" + integrity sha512-Fd4gABb+ycGAmKou8eMftCupSir5lRxqf4aD/vd0cD2qc4HL07OjCeuHMr8Ro4CoMaeCKDB0/ECBOVWjTwUvPQ== + +is-plain-obj@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/is-plain-obj/-/is-plain-obj-3.0.0.tgz#af6f2ea14ac5a646183a5bbdb5baabbc156ad9d7" + integrity sha512-gwsOE28k+23GP1B6vFl1oVh/WOzmawBrKwo5Ev6wMKzPkaXaCDIQKzLnvsA42DRlbVTWorkgTKIviAKCWkfUwA== + +is-plain-object@^2.0.4: + version "2.0.4" + resolved "https://registry.yarnpkg.com/is-plain-object/-/is-plain-object-2.0.4.tgz#2c163b3fafb1b606d9d17928f05c2a1c38e07677" + integrity sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og== + dependencies: + isobject "^3.0.1" + +is-regex@^1.0.4: + version "1.1.4" + resolved "https://registry.yarnpkg.com/is-regex/-/is-regex-1.1.4.tgz#eef5663cd59fa4c0ae339505323df6854bb15958" + integrity sha512-kvRdxDsxZjhzUX07ZnLydzS1TU/TJlTUHHY4YLL87e37oUA49DfkLqgy+VjFocowy29cKvcSiu+kIv728jTTVg== + dependencies: + call-bind "^1.0.2" + has-tostringtag "^1.0.0" + +is-stream@^2.0.0: + version "2.0.1" + resolved "https://registry.yarnpkg.com/is-stream/-/is-stream-2.0.1.tgz#fac1e3d53b97ad5a9d0ae9cef2389f5810a5c077" + integrity 
sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg== + +is-wsl@^2.2.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/is-wsl/-/is-wsl-2.2.0.tgz#74a4c76e77ca9fd3f932f290c17ea326cd157271" + integrity sha512-fKzAra0rGJUUBwGBgNkHZuToZcn+TtXHpeCgmkMJMMYx1sQDYaCSyjJBSCa2nH1DGm7s3n1oBnohoVTBaN7Lww== + dependencies: + is-docker "^2.0.0" + +isarray@~1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11" + integrity sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE= + +isexe@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10" + integrity sha1-6PvzdNxVb/iUehDcsFctYz8s+hA= + +isobject@^3.0.1: + version "3.0.1" + resolved "https://registry.yarnpkg.com/isobject/-/isobject-3.0.1.tgz#4e431e92b11a9731636aa1f9c8d1ccbcfdab78df" + integrity sha1-TkMekrEalzFjaqH5yNHMvP2reN8= + +jest-worker@^27.4.5: + version "27.5.1" + resolved "https://registry.yarnpkg.com/jest-worker/-/jest-worker-27.5.1.tgz#8d146f0900e8973b106b6f73cc1e9a8cb86f8db0" + integrity sha512-7vuh85V5cdDofPyxn58nrPjBktZo0u9x1g8WtjQol+jZDaE+fhN+cIvTj11GndBnMnyfrUOG1sZQxCdjKh+DKg== + dependencies: + "@types/node" "*" + merge-stream "^2.0.0" + supports-color "^8.0.0" + +"js-tokens@^3.0.0 || ^4.0.0": + version "4.0.0" + resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499" + integrity sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ== + +json-parse-better-errors@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/json-parse-better-errors/-/json-parse-better-errors-1.0.2.tgz#bb867cfb3450e69107c131d1c514bab3dc8bcaa9" + integrity sha512-mrqyZKfX5EhL7hvqcV6WG1yYjnjeuYDzDhhcAAUrq8Po85NBQBJP+ZDUT75qZQ98IkUoBqdkExkukOU7Ts2wrw== + +json-schema-traverse@^0.4.1: + version "0.4.1" + resolved "https://registry.yarnpkg.com/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz#69f6a87d9513ab8bb8fe63bdb0979c448e684660" + integrity sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg== + +json-schema-traverse@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz#ae7bcb3656ab77a73ba5c49bf654f38e6b6860e2" + integrity sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug== + +json2mq@^0.2.0: + version "0.2.0" + resolved "https://registry.yarnpkg.com/json2mq/-/json2mq-0.2.0.tgz#b637bd3ba9eabe122c83e9720483aeb10d2c904a" + integrity sha1-tje9O6nqvhIsg+lyBIOusQ0skEo= + dependencies: + string-convert "^0.2.0" + +json5@^2.1.2: + version "2.2.0" + resolved "https://registry.yarnpkg.com/json5/-/json5-2.2.0.tgz#2dfefe720c6ba525d9ebd909950f0515316c89a3" + integrity sha512-f+8cldu7X/y7RAJurMEJmdoKXGB/X550w2Nr3tTbezL6RwEE/iMcm+tZnXeoZtKuOq6ft8+CqzEkrIgx1fPoQA== + dependencies: + minimist "^1.2.5" + +jss-plugin-camel-case@^10.5.1: + version "10.9.0" + resolved "https://registry.yarnpkg.com/jss-plugin-camel-case/-/jss-plugin-camel-case-10.9.0.tgz#4921b568b38d893f39736ee8c4c5f1c64670aaf7" + integrity sha512-UH6uPpnDk413/r/2Olmw4+y54yEF2lRIV8XIZyuYpgPYTITLlPOsq6XB9qeqv+75SQSg3KLocq5jUBXW8qWWww== + dependencies: + "@babel/runtime" "^7.3.1" + hyphenate-style-name "^1.0.3" + jss "10.9.0" + +jss-plugin-default-unit@^10.5.1: + version "10.9.0" + resolved 
"https://registry.yarnpkg.com/jss-plugin-default-unit/-/jss-plugin-default-unit-10.9.0.tgz#bb23a48f075bc0ce852b4b4d3f7582bc002df991" + integrity sha512-7Ju4Q9wJ/MZPsxfu4T84mzdn7pLHWeqoGd/D8O3eDNNJ93Xc8PxnLmV8s8ZPNRYkLdxZqKtm1nPQ0BM4JRlq2w== + dependencies: + "@babel/runtime" "^7.3.1" + jss "10.9.0" + +jss-plugin-global@^10.5.1: + version "10.9.0" + resolved "https://registry.yarnpkg.com/jss-plugin-global/-/jss-plugin-global-10.9.0.tgz#fc07a0086ac97aca174e37edb480b69277f3931f" + integrity sha512-4G8PHNJ0x6nwAFsEzcuVDiBlyMsj2y3VjmFAx/uHk/R/gzJV+yRHICjT4MKGGu1cJq2hfowFWCyrr/Gg37FbgQ== + dependencies: + "@babel/runtime" "^7.3.1" + jss "10.9.0" + +jss-plugin-nested@^10.5.1: + version "10.9.0" + resolved "https://registry.yarnpkg.com/jss-plugin-nested/-/jss-plugin-nested-10.9.0.tgz#cc1c7d63ad542c3ccc6e2c66c8328c6b6b00f4b3" + integrity sha512-2UJnDrfCZpMYcpPYR16oZB7VAC6b/1QLsRiAutOt7wJaaqwCBvNsosLEu/fUyKNQNGdvg2PPJFDO5AX7dwxtoA== + dependencies: + "@babel/runtime" "^7.3.1" + jss "10.9.0" + tiny-warning "^1.0.2" + +jss-plugin-props-sort@^10.5.1: + version "10.9.0" + resolved "https://registry.yarnpkg.com/jss-plugin-props-sort/-/jss-plugin-props-sort-10.9.0.tgz#30e9567ef9479043feb6e5e59db09b4de687c47d" + integrity sha512-7A76HI8bzwqrsMOJTWKx/uD5v+U8piLnp5bvru7g/3ZEQOu1+PjHvv7bFdNO3DwNPC9oM0a//KwIJsIcDCjDzw== + dependencies: + "@babel/runtime" "^7.3.1" + jss "10.9.0" + +jss-plugin-rule-value-function@^10.5.1: + version "10.9.0" + resolved "https://registry.yarnpkg.com/jss-plugin-rule-value-function/-/jss-plugin-rule-value-function-10.9.0.tgz#379fd2732c0746fe45168011fe25544c1a295d67" + integrity sha512-IHJv6YrEf8pRzkY207cPmdbBstBaE+z8pazhPShfz0tZSDtRdQua5jjg6NMz3IbTasVx9FdnmptxPqSWL5tyJg== + dependencies: + "@babel/runtime" "^7.3.1" + jss "10.9.0" + tiny-warning "^1.0.2" + +jss-plugin-vendor-prefixer@^10.5.1: + version "10.9.0" + resolved "https://registry.yarnpkg.com/jss-plugin-vendor-prefixer/-/jss-plugin-vendor-prefixer-10.9.0.tgz#aa9df98abfb3f75f7ed59a3ec50a5452461a206a" + integrity sha512-MbvsaXP7iiVdYVSEoi+blrW+AYnTDvHTW6I6zqi7JcwXdc6I9Kbm234nEblayhF38EftoenbM+5218pidmC5gA== + dependencies: + "@babel/runtime" "^7.3.1" + css-vendor "^2.0.8" + jss "10.9.0" + +jss@10.9.0, jss@^10.5.1: + version "10.9.0" + resolved "https://registry.yarnpkg.com/jss/-/jss-10.9.0.tgz#7583ee2cdc904a83c872ba695d1baab4b59c141b" + integrity sha512-YpzpreB6kUunQBbrlArlsMpXYyndt9JATbt95tajx0t4MTJJcCJdd4hdNpHmOIDiUJrF/oX5wtVFrS3uofWfGw== + dependencies: + "@babel/runtime" "^7.3.1" + csstype "^3.0.2" + is-in-browser "^1.1.3" + tiny-warning "^1.0.2" + +kind-of@^6.0.2: + version "6.0.3" + resolved "https://registry.yarnpkg.com/kind-of/-/kind-of-6.0.3.tgz#07c05034a6c349fa06e24fa35aa76db4580ce4dd" + integrity sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw== + +loader-runner@^4.2.0: + version "4.2.0" + resolved "https://registry.yarnpkg.com/loader-runner/-/loader-runner-4.2.0.tgz#d7022380d66d14c5fb1d496b89864ebcfd478384" + integrity sha512-92+huvxMvYlMzMt0iIOukcwYBFpkYJdpl2xsZ7LrlayO7E8SOv+JJUEK17B/dJIHAOLMfh2dZZ/Y18WgmGtYNw== + +loader-utils@^2.0.0: + version "2.0.2" + resolved "https://registry.yarnpkg.com/loader-utils/-/loader-utils-2.0.2.tgz#d6e3b4fb81870721ae4e0868ab11dd638368c129" + integrity sha512-TM57VeHptv569d/GKh6TAYdzKblwDNiumOdkFnejjD0XwTH87K90w3O7AiJRqdQoXygvi1VQTJTLGhJl7WqA7A== + dependencies: + big.js "^5.2.2" + emojis-list "^3.0.0" + json5 "^2.1.2" + +locate-path@^5.0.0: + version "5.0.0" + resolved 
"https://registry.yarnpkg.com/locate-path/-/locate-path-5.0.0.tgz#1afba396afd676a6d42504d0a67a3a7eb9f62aa0" + integrity sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g== + dependencies: + p-locate "^4.1.0" + +lodash@^4.17.14, lodash@^4.17.20, lodash@^4.17.21: + version "4.17.21" + resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.21.tgz#679591c564c3bffaae8454cf0b3df370c3d6911c" + integrity sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg== + +loose-envify@^1.1.0, loose-envify@^1.4.0: + version "1.4.0" + resolved "https://registry.yarnpkg.com/loose-envify/-/loose-envify-1.4.0.tgz#71ee51fa7be4caec1a63839f7e682d8132d30caf" + integrity sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q== + dependencies: + js-tokens "^3.0.0 || ^4.0.0" + +lower-case@^2.0.2: + version "2.0.2" + resolved "https://registry.yarnpkg.com/lower-case/-/lower-case-2.0.2.tgz#6fa237c63dbdc4a82ca0fd882e4722dc5e634e28" + integrity sha512-7fm3l3NAF9WfN6W3JOmf5drwpVqX78JtoGJ3A6W0a6ZnldM41w2fV5D490psKFTpMds8TJse/eHLFFsNHHjHgg== + dependencies: + tslib "^2.0.3" + +lru-cache@^6.0.0: + version "6.0.0" + resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-6.0.0.tgz#6d6fe6570ebd96aaf90fcad1dafa3b2566db3a94" + integrity sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA== + dependencies: + yallist "^4.0.0" + +media-typer@0.3.0: + version "0.3.0" + resolved "https://registry.yarnpkg.com/media-typer/-/media-typer-0.3.0.tgz#8710d7af0aa626f8fffa1ce00168545263255748" + integrity sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g= + +memfs@^3.4.1: + version "3.4.1" + resolved "https://registry.yarnpkg.com/memfs/-/memfs-3.4.1.tgz#b78092f466a0dce054d63d39275b24c71d3f1305" + integrity sha512-1c9VPVvW5P7I85c35zAdEr1TD5+F11IToIHIlrVIcflfnzPkJa0ZoYEoEdYDP8KgPFoSZ/opDrUsAoZWym3mtw== + dependencies: + fs-monkey "1.0.3" + +"memoize-one@>=3.1.1 <6": + version "5.2.1" + resolved "https://registry.yarnpkg.com/memoize-one/-/memoize-one-5.2.1.tgz#8337aa3c4335581839ec01c3d594090cebe8f00e" + integrity sha512-zYiwtZUcYyXKo/np96AGZAckk+FWWsUdJ3cHGGmld7+AhvcWmQyGCYUh1hc4Q/pkOhb65dQR/pqCyK0cOaHz4Q== + +memoize-one@^3.1.1: + version "3.1.1" + resolved "https://registry.yarnpkg.com/memoize-one/-/memoize-one-3.1.1.tgz#ef609811e3bc28970eac2884eece64d167830d17" + integrity sha512-YqVh744GsMlZu6xkhGslPSqSurOv6P+kLN2J3ysBZfagLcL5FdRK/0UpgLoL8hwjjEvvAVkjJZyFP+1T6p1vgA== + +memoize-one@^6.0.0: + version "6.0.0" + resolved "https://registry.yarnpkg.com/memoize-one/-/memoize-one-6.0.0.tgz#b2591b871ed82948aee4727dc6abceeeac8c1045" + integrity sha512-rkpe71W0N0c0Xz6QD0eJETuWAJGnJ9afsl1srmwPrI+yBCkge5EycXXbYRyvL29zZVUWQCY7InPRCv3GDXuZNw== + +memory-fs@^0.5.0: + version "0.5.0" + resolved "https://registry.yarnpkg.com/memory-fs/-/memory-fs-0.5.0.tgz#324c01288b88652966d161db77838720845a8e3c" + integrity sha512-jA0rdU5KoQMC0e6ppoNRtpp6vjFq6+NY7r8hywnC7V+1Xj/MtHwGIbB1QaK/dunyjWteJzmkpd7ooeWg10T7GA== + dependencies: + errno "^0.1.3" + readable-stream "^2.0.1" + +merge-descriptors@1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/merge-descriptors/-/merge-descriptors-1.0.1.tgz#b00aaa556dd8b44568150ec9d1b953f3f90cbb61" + integrity sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E= + +merge-stream@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/merge-stream/-/merge-stream-2.0.0.tgz#52823629a14dd00c9770fb6ad47dc6310f2c1f60" + integrity 
sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w== + +merge2@^1.3.0, merge2@^1.4.1: + version "1.4.1" + resolved "https://registry.yarnpkg.com/merge2/-/merge2-1.4.1.tgz#4368892f885e907455a6fd7dc55c0c9d404990ae" + integrity sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg== + +methods@~1.1.2: + version "1.1.2" + resolved "https://registry.yarnpkg.com/methods/-/methods-1.1.2.tgz#5529a4d67654134edcc5266656835b0f851afcee" + integrity sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4= + +micromatch@^4.0.0, micromatch@^4.0.2, micromatch@^4.0.4: + version "4.0.4" + resolved "https://registry.yarnpkg.com/micromatch/-/micromatch-4.0.4.tgz#896d519dfe9db25fce94ceb7a500919bf881ebf9" + integrity sha512-pRmzw/XUcwXGpD9aI9q/0XOwLNygjETJ8y0ao0wdqprrzDa4YnxLcz7fQRZr8voh8V10kGhABbNcHVk5wHgWwg== + dependencies: + braces "^3.0.1" + picomatch "^2.2.3" + +mime-db@1.51.0: + version "1.51.0" + resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.51.0.tgz#d9ff62451859b18342d960850dc3cfb77e63fb0c" + integrity sha512-5y8A56jg7XVQx2mbv1lu49NR4dokRnhZYTtL+KGfaa27uq4pSTXkwQkFJl4pkRMyNFz/EtYDSkiiEHx3F7UN6g== + +"mime-db@>= 1.43.0 < 2": + version "1.52.0" + resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.52.0.tgz#bbabcdc02859f4987301c856e3387ce5ec43bf70" + integrity sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg== + +mime-types@^2.1.27, mime-types@^2.1.31, mime-types@~2.1.17, mime-types@~2.1.24, mime-types@~2.1.34: + version "2.1.34" + resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.34.tgz#5a712f9ec1503511a945803640fafe09d3793c24" + integrity sha512-6cP692WwGIs9XXdOO4++N+7qjqv0rqxxVvJ3VHPh/Sc9mVZcQP+ZGhkKiTvWMQRr2tbHkJP/Yn7Y0npb3ZBs4A== + dependencies: + mime-db "1.51.0" + +mime@1.6.0: + version "1.6.0" + resolved "https://registry.yarnpkg.com/mime/-/mime-1.6.0.tgz#32cd9e5c64553bd58d19a568af452acff04981b1" + integrity sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg== + +mimic-fn@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/mimic-fn/-/mimic-fn-2.1.0.tgz#7ed2c2ccccaf84d3ffcb7a69b57711fc2083401b" + integrity sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg== + +minimalistic-assert@^1.0.0: + version "1.0.1" + resolved "https://registry.yarnpkg.com/minimalistic-assert/-/minimalistic-assert-1.0.1.tgz#2e194de044626d4a10e7f7fbc00ce73e83e4d5c7" + integrity sha512-UtJcAD4yEaGtjPezWuO9wC4nwUnVH/8/Im3yEHQP4b67cXlD/Qr9hdITCU1xDbSEXg2XKNaP8jsReV7vQd00/A== + +minimatch@^3.0.4: + version "3.1.2" + resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b" + integrity sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw== + dependencies: + brace-expansion "^1.1.7" + +minimist@^1.2.5: + version "1.2.5" + resolved "https://registry.yarnpkg.com/minimist/-/minimist-1.2.5.tgz#67d66014b66a6a8aaa0c083c5fd58df4e4e97602" + integrity sha512-FM9nNUYrRBAELZQT3xeZQ7fmMOBg6nWNmJKTcgsJeaLstP/UODVpGsr5OhXhhXg6f+qtJ8uiZ+PUxkDWcgIXLw== + +mkdirp@^0.5.5: + version "0.5.5" + resolved "https://registry.yarnpkg.com/mkdirp/-/mkdirp-0.5.5.tgz#d91cefd62d1436ca0f41620e251288d420099def" + integrity sha512-NKmAlESf6jMGym1++R0Ra7wvhV+wFW63FaSOFPwRahvea0gMUcGUhVeAg/0BC0wiv9ih5NYPB1Wn1UEI1/L+xQ== + dependencies: + minimist "^1.2.5" + +moment@^2.24.0, moment@^2.25.3: + version "2.29.1" + resolved 
"https://registry.yarnpkg.com/moment/-/moment-2.29.1.tgz#b2be769fa31940be9eeea6469c075e35006fa3d3" + integrity sha512-kHmoybcPV8Sqy59DwNDY3Jefr64lK/by/da0ViFcuA4DH0vQg5Q6Ze5VimxkfQNSC+Mls/Kx53s7TjP1RhFEDQ== + +ms@2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/ms/-/ms-2.0.0.tgz#5608aeadfc00be6c2901df5f9861788de0d597c8" + integrity sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g= + +ms@2.1.2: + version "2.1.2" + resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009" + integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w== + +ms@2.1.3, ms@^2.1.1: + version "2.1.3" + resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.3.tgz#574c8138ce1d2b5861f0b44579dbadd60c6615b2" + integrity sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA== + +multicast-dns-service-types@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/multicast-dns-service-types/-/multicast-dns-service-types-1.1.0.tgz#899f11d9686e5e05cb91b35d5f0e63b773cfc901" + integrity sha1-iZ8R2WhuXgXLkbNdXw5jt3PPyQE= + +multicast-dns@^6.0.1: + version "6.2.3" + resolved "https://registry.yarnpkg.com/multicast-dns/-/multicast-dns-6.2.3.tgz#a0ec7bd9055c4282f790c3c82f4e28db3b31b229" + integrity sha512-ji6J5enbMyGRHIAkAOu3WdV8nggqviKCEKtXcOqfphZZtQrmHKycfynJ2V7eVPUA4NhJ6V7Wf4TmGbTwKE9B6g== + dependencies: + dns-packet "^1.3.1" + thunky "^1.0.2" + +nanoid@^3.1.31, nanoid@^3.3.1: + version "3.3.1" + resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.1.tgz#6347a18cac88af88f58af0b3594b723d5e99bb35" + integrity sha512-n6Vs/3KGyxPQd6uO0eH4Bv0ojGSUvuLlIHtC3Y0kEO23YRge8H9x1GCzLn28YX0H66pMkxuaeESFq4tKISKwdw== + +negotiator@0.6.3: + version "0.6.3" + resolved "https://registry.yarnpkg.com/negotiator/-/negotiator-0.6.3.tgz#58e323a72fedc0d6f9cd4d31fe49f51479590ccd" + integrity sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg== + +neo-async@^2.6.2: + version "2.6.2" + resolved "https://registry.yarnpkg.com/neo-async/-/neo-async-2.6.2.tgz#b4aafb93e3aeb2d8174ca53cf163ab7d7308305f" + integrity sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw== + +no-case@^3.0.4: + version "3.0.4" + resolved "https://registry.yarnpkg.com/no-case/-/no-case-3.0.4.tgz#d361fd5c9800f558551a8369fc0dcd4662b6124d" + integrity sha512-fgAN3jGAh+RoxUGZHTSOLJIqUc2wmoBwGR4tbpNAKmmovFoWq0OdRkb0VkldReO2a2iBT/OEulG9XSUc10r3zg== + dependencies: + lower-case "^2.0.2" + tslib "^2.0.3" + +node-fetch@^1.0.1, node-fetch@^2.6.1: + version "2.6.7" + resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.7.tgz#24de9fba827e3b4ae44dc8b20256a379160052ad" + integrity sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ== + dependencies: + whatwg-url "^5.0.0" + +node-forge@^1.2.0: + version "1.2.1" + resolved "https://registry.yarnpkg.com/node-forge/-/node-forge-1.2.1.tgz#82794919071ef2eb5c509293325cec8afd0fd53c" + integrity sha512-Fcvtbb+zBcZXbTTVwqGA5W+MKBj56UjVRevvchv5XrcyXbmNdesfZL37nlcWOfpgHhgmxApw3tQbTr4CqNmX4w== + +node-releases@^2.0.2: + version "2.0.2" + resolved "https://registry.yarnpkg.com/node-releases/-/node-releases-2.0.2.tgz#7139fe71e2f4f11b47d4d2986aaf8c48699e0c01" + integrity sha512-XxYDdcQ6eKqp/YjI+tb2C5WM2LgjnZrfYg4vgQt49EK268b6gYCHsBLrK2qvJo4FmCtqmKezb0WZFK4fkrZNsg== + +normalize-path@^3.0.0, normalize-path@~3.0.0: + version "3.0.0" + resolved 
"https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65" + integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA== + +npm-run-path@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea" + integrity sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw== + dependencies: + path-key "^3.0.0" + +nth-check@^2.0.1: + version "2.0.1" + resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-2.0.1.tgz#2efe162f5c3da06a28959fbd3db75dbeea9f0fc2" + integrity sha512-it1vE95zF6dTT9lBsYbxvqh0Soy4SPowchj0UBGj/V6cTPnXXtQOPUbhZ6CmGzAD/rW22LQK6E96pcdJXk4A4w== + dependencies: + boolbase "^1.0.0" + +object-assign@^4.1.1: + version "4.1.1" + resolved "https://registry.yarnpkg.com/object-assign/-/object-assign-4.1.1.tgz#2109adc7965887cfc05cbbd442cac8bfbb360863" + integrity sha1-IQmtx5ZYh8/AXLvUQsrIv7s2CGM= + +object-is@^1.0.1: + version "1.1.5" + resolved "https://registry.yarnpkg.com/object-is/-/object-is-1.1.5.tgz#b9deeaa5fc7f1846a0faecdceec138e5778f53ac" + integrity sha512-3cyDsyHgtmi7I7DfSSI2LDp6SK2lwvtbg0p0R1e0RvTqF5ceGx+K2dfSjm1bKDMVCFEDAQvy+o8c6a7VujOddw== + dependencies: + call-bind "^1.0.2" + define-properties "^1.1.3" + +object-keys@^1.0.12, object-keys@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/object-keys/-/object-keys-1.1.1.tgz#1c47f272df277f3b1daf061677d9c82e2322c60e" + integrity sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA== + +obuf@^1.0.0, obuf@^1.1.2: + version "1.1.2" + resolved "https://registry.yarnpkg.com/obuf/-/obuf-1.1.2.tgz#09bea3343d41859ebd446292d11c9d4db619084e" + integrity sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg== + +on-finished@~2.3.0: + version "2.3.0" + resolved "https://registry.yarnpkg.com/on-finished/-/on-finished-2.3.0.tgz#20f1336481b083cd75337992a16971aa2d906947" + integrity sha1-IPEzZIGwg811M3mSoWlxqi2QaUc= + dependencies: + ee-first "1.1.1" + +on-headers@~1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/on-headers/-/on-headers-1.0.2.tgz#772b0ae6aaa525c399e489adfad90c403eb3c28f" + integrity sha512-pZAE+FJLoyITytdqK0U5s+FIpjN0JP3OzFi/u8Rx+EV5/W+JTWGXG8xFzevE7AjBfDqHv/8vL8qQsIhHnqRkrA== + +once@^1.3.0: + version "1.4.0" + resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1" + integrity sha1-WDsap3WWHUsROsF9nFC6753Xa9E= + dependencies: + wrappy "1" + +onetime@^5.1.2: + version "5.1.2" + resolved "https://registry.yarnpkg.com/onetime/-/onetime-5.1.2.tgz#d0e96ebb56b07476df1dd9c4806e5237985ca45e" + integrity sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg== + dependencies: + mimic-fn "^2.1.0" + +open@^8.0.9: + version "8.4.0" + resolved "https://registry.yarnpkg.com/open/-/open-8.4.0.tgz#345321ae18f8138f82565a910fdc6b39e8c244f8" + integrity sha512-XgFPPM+B28FtCCgSb9I+s9szOC1vZRSwgWsRUA5ylIxRTgKozqjOCrVOqGsYABPYK5qnfqClxZTFBa8PKt2v6Q== + dependencies: + define-lazy-prop "^2.0.0" + is-docker "^2.1.1" + is-wsl "^2.2.0" + +p-limit@^2.2.0: + version "2.3.0" + resolved "https://registry.yarnpkg.com/p-limit/-/p-limit-2.3.0.tgz#3dd33c647a214fdfffd835933eb086da0dc21db1" + integrity sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w== + dependencies: + p-try "^2.0.0" + 
+p-locate@^4.1.0: + version "4.1.0" + resolved "https://registry.yarnpkg.com/p-locate/-/p-locate-4.1.0.tgz#a3428bb7088b3a60292f66919278b7c297ad4f07" + integrity sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A== + dependencies: + p-limit "^2.2.0" + +p-map@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/p-map/-/p-map-4.0.0.tgz#bb2f95a5eda2ec168ec9274e06a747c3e2904d2b" + integrity sha512-/bjOqmgETBYB5BoEeGVea8dmvHb2m9GLy1E9W43yeyfP6QQCZGFNa+XRceJEuDB6zqr+gKpIAmlLebMpykw/MQ== + dependencies: + aggregate-error "^3.0.0" + +p-retry@^4.5.0: + version "4.6.1" + resolved "https://registry.yarnpkg.com/p-retry/-/p-retry-4.6.1.tgz#8fcddd5cdf7a67a0911a9cf2ef0e5df7f602316c" + integrity sha512-e2xXGNhZOZ0lfgR9kL34iGlU8N/KO0xZnQxVEwdeOvpqNDQfdnxIYizvWtK8RglUa3bGqI8g0R/BdfzLMxRkiA== + dependencies: + "@types/retry" "^0.12.0" + retry "^0.13.1" + +p-try@^2.0.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/p-try/-/p-try-2.2.0.tgz#cb2868540e313d61de58fafbe35ce9004d5540e6" + integrity sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ== + +param-case@^3.0.4: + version "3.0.4" + resolved "https://registry.yarnpkg.com/param-case/-/param-case-3.0.4.tgz#7d17fe4aa12bde34d4a77d91acfb6219caad01c5" + integrity sha512-RXlj7zCYokReqWpOPH9oYivUzLYZ5vAPIfEmCTNViosC78F8F0H9y7T7gG2M39ymgutxF5gcFEsyZQSph9Bp3A== + dependencies: + dot-case "^3.0.4" + tslib "^2.0.3" + +parseurl@~1.3.2, parseurl@~1.3.3: + version "1.3.3" + resolved "https://registry.yarnpkg.com/parseurl/-/parseurl-1.3.3.tgz#9da19e7bee8d12dff0513ed5b76957793bc2e8d4" + integrity sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ== + +pascal-case@^3.1.2: + version "3.1.2" + resolved "https://registry.yarnpkg.com/pascal-case/-/pascal-case-3.1.2.tgz#b48e0ef2b98e205e7c1dae747d0b1508237660eb" + integrity sha512-uWlGT3YSnK9x3BQJaOdcZwrnV6hPpd8jFH1/ucpiLRPh/2zCVJKS19E4GvYHvaCcACn3foXZ0cLB9Wrx1KGe5g== + dependencies: + no-case "^3.0.4" + tslib "^2.0.3" + +path-exists@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/path-exists/-/path-exists-4.0.0.tgz#513bdbe2d3b95d7762e8c1137efa195c6c61b5b3" + integrity sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w== + +path-is-absolute@^1.0.0: + version "1.0.1" + resolved "https://registry.yarnpkg.com/path-is-absolute/-/path-is-absolute-1.0.1.tgz#174b9268735534ffbc7ace6bf53a5a9e1b5c5f5f" + integrity sha1-F0uSaHNVNP+8es5r9TpanhtcX18= + +path-key@^3.0.0, path-key@^3.1.0: + version "3.1.1" + resolved "https://registry.yarnpkg.com/path-key/-/path-key-3.1.1.tgz#581f6ade658cbba65a0d3380de7753295054f375" + integrity sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q== + +path-parse@^1.0.7: + version "1.0.7" + resolved "https://registry.yarnpkg.com/path-parse/-/path-parse-1.0.7.tgz#fbc114b60ca42b30d9daf5858e4bd68bbedb6735" + integrity sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw== + +path-to-regexp@0.1.7: + version "0.1.7" + resolved "https://registry.yarnpkg.com/path-to-regexp/-/path-to-regexp-0.1.7.tgz#df604178005f522f15eb4490e7247a1bfaa67f8c" + integrity sha1-32BBeABfUi8V60SQ5yR6G/qmf4w= + +path-type@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b" + integrity 
sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw== + +picocolors@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-1.0.0.tgz#cb5bdc74ff3f51892236eaf79d68bc44564ab81c" + integrity sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ== + +picomatch@^2.0.4, picomatch@^2.2.1, picomatch@^2.2.3: + version "2.3.1" + resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42" + integrity sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA== + +pkg-dir@^4.2.0: + version "4.2.0" + resolved "https://registry.yarnpkg.com/pkg-dir/-/pkg-dir-4.2.0.tgz#f099133df7ede422e81d1d8448270eeb3e4261f3" + integrity sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ== + dependencies: + find-up "^4.0.0" + +popper.js@1.16.1-lts: + version "1.16.1-lts" + resolved "https://registry.yarnpkg.com/popper.js/-/popper.js-1.16.1-lts.tgz#cf6847b807da3799d80ee3d6d2f90df8a3f50b05" + integrity sha512-Kjw8nKRl1m+VrSFCoVGPph93W/qrSO7ZkqPpTf7F4bk/sqcfWK019dWBUpE/fBOsOQY1dks/Bmcbfn1heM/IsA== + +portable-fetch@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/portable-fetch/-/portable-fetch-3.0.0.tgz#3cbf4aa6dbc5a5734b41c0419c9273313bfd9ad8" + integrity sha1-PL9KptvFpXNLQcBBnJJzMTv9mtg= + dependencies: + node-fetch "^1.0.1" + whatwg-fetch ">=0.10.0" + +portfinder@^1.0.28: + version "1.0.28" + resolved "https://registry.yarnpkg.com/portfinder/-/portfinder-1.0.28.tgz#67c4622852bd5374dd1dd900f779f53462fac778" + integrity sha512-Se+2isanIcEqf2XMHjyUKskczxbPH7dQnlMjXX6+dybayyHvAf/TCgyMRlzf/B6QDhAEFOGes0pzRo3by4AbMA== + dependencies: + async "^2.6.2" + debug "^3.1.1" + mkdirp "^0.5.5" + +postcss-modules-extract-imports@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/postcss-modules-extract-imports/-/postcss-modules-extract-imports-3.0.0.tgz#cda1f047c0ae80c97dbe28c3e76a43b88025741d" + integrity sha512-bdHleFnP3kZ4NYDhuGlVK+CMrQ/pqUm8bx/oGL93K6gVwiclvX5x0n76fYMKuIGKzlABOy13zsvqjb0f92TEXw== + +postcss-modules-local-by-default@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/postcss-modules-local-by-default/-/postcss-modules-local-by-default-4.0.0.tgz#ebbb54fae1598eecfdf691a02b3ff3b390a5a51c" + integrity sha512-sT7ihtmGSF9yhm6ggikHdV0hlziDTX7oFoXtuVWeDd3hHObNkcHRo9V3yg7vCAY7cONyxJC/XXCmmiHHcvX7bQ== + dependencies: + icss-utils "^5.0.0" + postcss-selector-parser "^6.0.2" + postcss-value-parser "^4.1.0" + +postcss-modules-scope@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/postcss-modules-scope/-/postcss-modules-scope-3.0.0.tgz#9ef3151456d3bbfa120ca44898dfca6f2fa01f06" + integrity sha512-hncihwFA2yPath8oZ15PZqvWGkWf+XUfQgUGamS4LqoP1anQLOsOJw0vr7J7IwLpoY9fatA2qiGUGmuZL0Iqlg== + dependencies: + postcss-selector-parser "^6.0.4" + +postcss-modules-values@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/postcss-modules-values/-/postcss-modules-values-4.0.0.tgz#d7c5e7e68c3bb3c9b27cbf48ca0bb3ffb4602c9c" + integrity sha512-RDxHkAiEGI78gS2ofyvCsu7iycRv7oqw5xMWn9iMoR0N/7mf9D50ecQqUo5BZ9Zh2vH4bCUR/ktCqbB9m8vJjQ== + dependencies: + icss-utils "^5.0.0" + +postcss-selector-parser@^6.0.2, postcss-selector-parser@^6.0.4: + version "6.0.9" + resolved "https://registry.yarnpkg.com/postcss-selector-parser/-/postcss-selector-parser-6.0.9.tgz#ee71c3b9ff63d9cd130838876c13a2ec1a992b2f" + integrity 
sha512-UO3SgnZOVTwu4kyLR22UQ1xZh086RyNZppb7lLAKBFK8a32ttG5i87Y/P3+2bRSjZNyJ1B7hfFNo273tKe9YxQ== + dependencies: + cssesc "^3.0.0" + util-deprecate "^1.0.2" + +postcss-value-parser@^4.1.0: + version "4.2.0" + resolved "https://registry.yarnpkg.com/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz#723c09920836ba6d3e5af019f92bc0971c02e514" + integrity sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ== + +postcss@^8.2.15: + version "8.4.8" + resolved "https://registry.yarnpkg.com/postcss/-/postcss-8.4.8.tgz#dad963a76e82c081a0657d3a2f3602ce10c2e032" + integrity sha512-2tXEqGxrjvAO6U+CJzDL2Fk2kPHTv1jQsYkSoMeOis2SsYaXRO2COxTdQp99cYvif9JTXaAk9lYGc3VhJt7JPQ== + dependencies: + nanoid "^3.3.1" + picocolors "^1.0.0" + source-map-js "^1.0.2" + +prettier@^2.1.2: + version "2.5.1" + resolved "https://registry.yarnpkg.com/prettier/-/prettier-2.5.1.tgz#fff75fa9d519c54cf0fce328c1017d94546bc56a" + integrity sha512-vBZcPRUR5MZJwoyi3ZoyQlc1rXeEck8KgeC9AwwOn+exuxLxq5toTRDTSaVrXHxelDMHy9zlicw8u66yxoSUFg== + +pretty-error@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/pretty-error/-/pretty-error-4.0.0.tgz#90a703f46dd7234adb46d0f84823e9d1cb8f10d6" + integrity sha512-AoJ5YMAcXKYxKhuJGdcvse+Voc6v1RgnsR3nWcYU7q4t6z0Q6T86sv5Zq8VIRbOWWFpvdGE83LtdSMNd+6Y0xw== + dependencies: + lodash "^4.17.20" + renderkid "^3.0.0" + +process-nextick-args@~2.0.0: + version "2.0.1" + resolved "https://registry.yarnpkg.com/process-nextick-args/-/process-nextick-args-2.0.1.tgz#7820d9b16120cc55ca9ae7792680ae7dba6d7fe2" + integrity sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag== + +prop-types@^15.6.2, prop-types@^15.7.2: + version "15.8.1" + resolved "https://registry.yarnpkg.com/prop-types/-/prop-types-15.8.1.tgz#67d87bf1a694f48435cf332c24af10214a3140b5" + integrity sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg== + dependencies: + loose-envify "^1.4.0" + object-assign "^4.1.1" + react-is "^16.13.1" + +proxy-addr@~2.0.7: + version "2.0.7" + resolved "https://registry.yarnpkg.com/proxy-addr/-/proxy-addr-2.0.7.tgz#f19fe69ceab311eeb94b42e70e8c2070f9ba1025" + integrity sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg== + dependencies: + forwarded "0.2.0" + ipaddr.js "1.9.1" + +prr@~1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/prr/-/prr-1.0.1.tgz#d3fc114ba06995a45ec6893f484ceb1d78f5f476" + integrity sha1-0/wRS6BplaRexok/SEzrHXj19HY= + +punycode@^2.1.0: + version "2.1.1" + resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec" + integrity sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A== + +qs@6.9.7: + version "6.9.7" + resolved "https://registry.yarnpkg.com/qs/-/qs-6.9.7.tgz#4610846871485e1e048f44ae3b94033f0e675afe" + integrity sha512-IhMFgUmuNpyRfxA90umL7ByLlgRXu6tIfKPpF5TmcfRLlLCckfP/g3IQmju6jjpu+Hh8rA+2p6A27ZSPOOHdKw== + +queue-microtask@^1.2.2: + version "1.2.3" + resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243" + integrity sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A== + +randombytes@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/randombytes/-/randombytes-2.1.0.tgz#df6f84372f0270dc65cdf6291349ab7a473d4f2a" + integrity 
sha512-vYl3iOX+4CKUWuxGi9Ukhie6fsqXqS9FE2Zaic4tNFD2N2QQaXOMFbuKK4QmDHC0JO6B1Zp41J0LpT0oR68amQ== + dependencies: + safe-buffer "^5.1.0" + +range-parser@^1.2.1, range-parser@~1.2.1: + version "1.2.1" + resolved "https://registry.yarnpkg.com/range-parser/-/range-parser-1.2.1.tgz#3cf37023d199e1c24d1a55b84800c2f3e6468031" + integrity sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg== + +raw-body@2.4.3: + version "2.4.3" + resolved "https://registry.yarnpkg.com/raw-body/-/raw-body-2.4.3.tgz#8f80305d11c2a0a545c2d9d89d7a0286fcead43c" + integrity sha512-UlTNLIcu0uzb4D2f4WltY6cVjLi+/jEN4lgEUj3E04tpMDpUlkBo/eSn6zou9hum2VMNpCCUone0O0WeJim07g== + dependencies: + bytes "3.1.2" + http-errors "1.8.1" + iconv-lite "0.4.24" + unpipe "1.0.0" + +rc-align@^4.0.0: + version "4.0.11" + resolved "https://registry.yarnpkg.com/rc-align/-/rc-align-4.0.11.tgz#8198c62db266bc1b8ef05e56c13275bf72628a5e" + integrity sha512-n9mQfIYQbbNTbefyQnRHZPWuTEwG1rY4a9yKlIWHSTbgwI+XUMGRYd0uJ5pE2UbrNX0WvnMBA1zJ3Lrecpra/A== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "2.x" + dom-align "^1.7.0" + lodash "^4.17.21" + rc-util "^5.3.0" + resize-observer-polyfill "^1.5.1" + +rc-cascader@~3.2.1: + version "3.2.7" + resolved "https://registry.yarnpkg.com/rc-cascader/-/rc-cascader-3.2.7.tgz#74ac3ab9258f930e0c84dfacffd838b122b2cedf" + integrity sha512-M8VtKtifTXXo/qqXj63p12tsMNXm1z45Lytj7tu86L6gxIF8keDPcJ16/ZqrhS5JwlBPfoJNA1VooNl/KId15A== + dependencies: + "@babel/runtime" "^7.12.5" + array-tree-filter "^2.1.0" + classnames "^2.3.1" + rc-select "~14.0.0-alpha.23" + rc-tree "~5.4.3" + rc-util "^5.6.1" + +rc-checkbox@~2.3.0: + version "2.3.2" + resolved "https://registry.yarnpkg.com/rc-checkbox/-/rc-checkbox-2.3.2.tgz#f91b3678c7edb2baa8121c9483c664fa6f0aefc1" + integrity sha512-afVi1FYiGv1U0JlpNH/UaEXdh6WUJjcWokj/nUN2TgG80bfG+MDdbfHKlLcNNba94mbjy2/SXJ1HDgrOkXGAjg== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.1" + +rc-collapse@~3.1.0: + version "3.1.2" + resolved "https://registry.yarnpkg.com/rc-collapse/-/rc-collapse-3.1.2.tgz#76028a811b845d03d9460ccc409c7ea8ad09db14" + integrity sha512-HujcKq7mghk/gVKeI6EjzTbb8e19XUZpakrYazu1MblEZ3Hu3WBMSN4A3QmvbF6n1g7x6lUlZvsHZ5shABWYOQ== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "2.x" + rc-motion "^2.3.4" + rc-util "^5.2.1" + shallowequal "^1.1.0" + +rc-dialog@~8.6.0: + version "8.6.0" + resolved "https://registry.yarnpkg.com/rc-dialog/-/rc-dialog-8.6.0.tgz#3b228dac085de5eed8c6237f31162104687442e7" + integrity sha512-GSbkfqjqxpZC5/zc+8H332+q5l/DKUhpQr0vdX2uDsxo5K0PhvaMEVjyoJUTkZ3+JstEADQji1PVLVb/2bJeOQ== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.6" + rc-motion "^2.3.0" + rc-util "^5.6.1" + +rc-drawer@~4.4.2: + version "4.4.3" + resolved "https://registry.yarnpkg.com/rc-drawer/-/rc-drawer-4.4.3.tgz#2094937a844e55dc9644236a2d9fba79c344e321" + integrity sha512-FYztwRs3uXnFOIf1hLvFxIQP9MiZJA+0w+Os8dfDh/90X7z/HqP/Yg+noLCIeHEbKln1Tqelv8ymCAN24zPcfQ== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.6" + rc-util "^5.7.0" + +rc-dropdown@^3.2.0, rc-dropdown@~3.3.2: + version "3.3.2" + resolved "https://registry.yarnpkg.com/rc-dropdown/-/rc-dropdown-3.3.2.tgz#097c2ec1b6d55c10eeb94dcf6120ba034c7a58e0" + integrity sha512-49GOz42oNvLtYGoJ2X5UWXJFp7aUiSZkj9OcgTV1UpxFZqHQMw+xijkaL5k3XDkMbb92XsuFnFt7IGG3/C0DKw== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.6" + rc-trigger "^5.0.4" + +rc-field-form@~1.23.0: + version "1.23.1" + resolved 
"https://registry.yarnpkg.com/rc-field-form/-/rc-field-form-1.23.1.tgz#638c11d05d7ed2efdcb862ff3da5fe2a7d199aaa" + integrity sha512-Mun+eaFmX1Pjud9bz0fD0IvxwDfFKWk2Q8tkt4sg4aKR9/FML/rzYC5MjY77p86X45XBurBDUR3gAda+Cg/ULw== + dependencies: + "@babel/runtime" "^7.8.4" + async-validator "^4.0.2" + rc-util "^5.8.0" + +rc-image@~5.2.5: + version "5.2.5" + resolved "https://registry.yarnpkg.com/rc-image/-/rc-image-5.2.5.tgz#44e6ffc842626827960e7ab72e1c0d6f3a8ce440" + integrity sha512-qUfZjYIODxO0c8a8P5GeuclYXZjzW4hV/5hyo27XqSFo1DmTCs2HkVeQObkcIk5kNsJtgsj1KoPThVsSc/PXOw== + dependencies: + "@babel/runtime" "^7.11.2" + classnames "^2.2.6" + rc-dialog "~8.6.0" + rc-util "^5.0.6" + +rc-input-number@~7.3.0: + version "7.3.4" + resolved "https://registry.yarnpkg.com/rc-input-number/-/rc-input-number-7.3.4.tgz#674aea98260250287d36e330a7e065b174486e9d" + integrity sha512-W9uqSzuvJUnz8H8vsVY4kx+yK51SsAxNTwr8SNH4G3XqQNocLVmKIibKFRjocnYX1RDHMND9FFbgj2h7E7nvGA== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.5" + rc-util "^5.9.8" + +rc-input@^0.0.1-alpha.5: + version "0.0.1-alpha.5" + resolved "https://registry.yarnpkg.com/rc-input/-/rc-input-0.0.1-alpha.5.tgz#cc043c44570c651f4d10d9809b3d634ed12537e6" + integrity sha512-RHvNweOVWFbbx2l/y6hgnSAdOg5fXc1D1VGhX2RNkGGyGr6cemnvyiYMxwZJjcXs0al3YK9jMObm20+DgH/mpw== + dependencies: + "@babel/runtime" "^7.11.1" + classnames "^2.2.1" + rc-util "^5.18.1" + +rc-mentions@~1.6.1: + version "1.6.2" + resolved "https://registry.yarnpkg.com/rc-mentions/-/rc-mentions-1.6.2.tgz#62ed7cdd8fa86d857c3ce3f9e73438022130815e" + integrity sha512-cntfJkNMq8B910rXuvnsnOV88DfmoUidnQnSIeXzWiYiUX4RL5oWUfSZzs+HAXYRU4SL1l8Mwjx95wHETiZ/fQ== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.6" + rc-menu "^9.0.0" + rc-textarea "^0.3.0" + rc-trigger "^5.0.4" + rc-util "^5.0.1" + +rc-menu@^9.0.0: + version "9.3.2" + resolved "https://registry.yarnpkg.com/rc-menu/-/rc-menu-9.3.2.tgz#bb842d37ebf71da912bea201cf7ef0a27267ad49" + integrity sha512-h3m45oY1INZyqphGELkdT0uiPnFzxkML8m0VMhJnk2fowtqfiT7F5tJLT3znEVaPIY80vMy1bClCkgq8U91CzQ== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "2.x" + rc-motion "^2.4.3" + rc-overflow "^1.2.0" + rc-trigger "^5.1.2" + rc-util "^5.12.0" + shallowequal "^1.1.0" + +rc-menu@~9.2.1: + version "9.2.1" + resolved "https://registry.yarnpkg.com/rc-menu/-/rc-menu-9.2.1.tgz#6fbe47f4846363bb81a5a21f0960026c3ada497a" + integrity sha512-UbEtn3rflJ8zS+etYGTVQuzy7Fm+yWXR5c0Rl6ecNTS/dPknRyWAyhJcbeR0Hu1+RdQT+0VCqrUPrgKnm4iY+w== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "2.x" + rc-motion "^2.4.3" + rc-overflow "^1.2.0" + rc-trigger "^5.1.2" + rc-util "^5.12.0" + shallowequal "^1.1.0" + +rc-motion@^2.0.0, rc-motion@^2.0.1, rc-motion@^2.2.0, rc-motion@^2.3.0, rc-motion@^2.3.4, rc-motion@^2.4.3, rc-motion@^2.4.4: + version "2.4.5" + resolved "https://registry.yarnpkg.com/rc-motion/-/rc-motion-2.4.5.tgz#b061c50bb29ecd3d735d5f4c40924a3c78226cbd" + integrity sha512-f3uJHR4gcpeZS/s8/nYFSOrXt2Wu/h9GrEcbJmC0qmKrVNgwL1pTgrT5kW7lgG6PFeoL4yHDmpQoEKkrPtKIzQ== + dependencies: + "@babel/runtime" "^7.11.1" + classnames "^2.2.1" + rc-util "^5.18.1" + +rc-notification@~4.5.7: + version "4.5.7" + resolved "https://registry.yarnpkg.com/rc-notification/-/rc-notification-4.5.7.tgz#265e6e6a0c1a0fac63d6abd4d832eb8ff31522f1" + integrity sha512-zhTGUjBIItbx96SiRu3KVURcLOydLUHZCPpYEn1zvh+re//Tnq/wSxN4FKgp38n4HOgHSVxcLEeSxBMTeBBDdw== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "2.x" + rc-motion "^2.2.0" + rc-util "^5.0.1" + +rc-overflow@^1.0.0, 
rc-overflow@^1.2.0: + version "1.2.3" + resolved "https://registry.yarnpkg.com/rc-overflow/-/rc-overflow-1.2.3.tgz#1754216d807f5473304272b0321c3aba7615f47a" + integrity sha512-Bz6dXTn/ww8nmu70tUQfRV0wT3BkfXY6j1lB1O38OVkDPz4xwfAcGK+LJ2zewUR5cTXkJ8hAN7YULohG8z4M7Q== + dependencies: + "@babel/runtime" "^7.11.1" + classnames "^2.2.1" + rc-resize-observer "^1.0.0" + rc-util "^5.15.0" + +rc-pagination@~3.1.9: + version "3.1.15" + resolved "https://registry.yarnpkg.com/rc-pagination/-/rc-pagination-3.1.15.tgz#e05eddf4c15717a5858290bed0857e27e2f957ff" + integrity sha512-4L3fot8g4E+PjWEgoVGX0noFCg+8ZFZmeLH4vsnZpB3O2T2zThtakjNxG+YvSaYtyMVT4B+GLayjKrKbXQpdAg== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.1" + +rc-picker@~2.6.4: + version "2.6.4" + resolved "https://registry.yarnpkg.com/rc-picker/-/rc-picker-2.6.4.tgz#916aa5fcd8abd11106f1c2fb64bfd549439abfa0" + integrity sha512-Mnc1udPyGNSG7/ya5SmYltUjCUcsMH7jfJnuuXVAvEaEdx9qZxDGMWtIii//+ARC06CSHQ83s5iwiGFwM+FcDw== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.1" + date-fns "2.x" + dayjs "1.x" + moment "^2.24.0" + rc-trigger "^5.0.4" + rc-util "^5.4.0" + shallowequal "^1.1.0" + +rc-progress@~3.2.1: + version "3.2.4" + resolved "https://registry.yarnpkg.com/rc-progress/-/rc-progress-3.2.4.tgz#4036acdae2566438545bc4df2203248babaf7549" + integrity sha512-M9WWutRaoVkPUPIrTpRIDpX0SPSrVHzxHdCRCbeoBFrd9UFWTYNWRlHsruJM5FH1AZI+BwB4wOJUNNylg/uFSw== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.6" + rc-util "^5.16.1" + +rc-rate@~2.9.0: + version "2.9.1" + resolved "https://registry.yarnpkg.com/rc-rate/-/rc-rate-2.9.1.tgz#e43cb95c4eb90a2c1e0b16ec6614d8c43530a731" + integrity sha512-MmIU7FT8W4LYRRHJD1sgG366qKtSaKb67D0/vVvJYR0lrCuRrCiVQ5qhfT5ghVO4wuVIORGpZs7ZKaYu+KMUzA== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.5" + rc-util "^5.0.1" + +rc-resize-observer@^1.0.0, rc-resize-observer@^1.1.0, rc-resize-observer@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/rc-resize-observer/-/rc-resize-observer-1.2.0.tgz#9f46052f81cdf03498be35144cb7c53fd282c4c7" + integrity sha512-6W+UzT3PyDM0wVCEHfoW3qTHPTvbdSgiA43buiy8PzmeMnfgnDeb9NjdimMXMl3/TcrvvWl5RRVdp+NqcR47pQ== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.1" + rc-util "^5.15.0" + resize-observer-polyfill "^1.5.1" + +rc-select@~14.0.0-alpha.15, rc-select@~14.0.0-alpha.23, rc-select@~14.0.0-alpha.8: + version "14.0.0" + resolved "https://registry.yarnpkg.com/rc-select/-/rc-select-14.0.0.tgz#87735dbc548f1cc8e94d579b21682ed2d34f7653" + integrity sha512-DkoWMhyxmrfpc1KJSqPORZdkKevzgOINvjR4WI+dibRe6i6DyqGB4Jk21sencnK9di6dumzOCHf93x9t9+gp3Q== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "2.x" + rc-motion "^2.0.1" + rc-overflow "^1.0.0" + rc-trigger "^5.0.4" + rc-util "^5.16.1" + rc-virtual-list "^3.2.0" + +rc-slider@~10.0.0-alpha.4: + version "10.0.0-alpha.4" + resolved "https://registry.yarnpkg.com/rc-slider/-/rc-slider-10.0.0-alpha.4.tgz#f14ec0905d53f1f9d7f495c301527d6eca5781cf" + integrity sha512-ih2xwkBgXAWAf7MjZIZyCiiWo6tnoIMuHifn0UeKXVAup7sH53QdSVvT9x/cysuSZIPNMYWEf6mec184n3gbiQ== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.5" + rc-tooltip "^5.0.1" + rc-util "^5.18.1" + shallowequal "^1.1.0" + +rc-steps@~4.1.0: + version "4.1.4" + resolved "https://registry.yarnpkg.com/rc-steps/-/rc-steps-4.1.4.tgz#0ba82db202d59ca52d0693dc9880dd145b19dc23" + integrity sha512-qoCqKZWSpkh/b03ASGx1WhpKnuZcRWmvuW+ZUu4mvMdfvFzVxblTwUM+9aBd0mlEUFmt6GW8FXhMpHkK3Uzp3w== + dependencies: + "@babel/runtime" 
"^7.10.2" + classnames "^2.2.3" + rc-util "^5.0.1" + +rc-switch@~3.2.0: + version "3.2.2" + resolved "https://registry.yarnpkg.com/rc-switch/-/rc-switch-3.2.2.tgz#d001f77f12664d52595b4f6fb425dd9e66fba8e8" + integrity sha512-+gUJClsZZzvAHGy1vZfnwySxj+MjLlGRyXKXScrtCTcmiYNPzxDFOxdQ/3pK1Kt/0POvwJ/6ALOR8gwdXGhs+A== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.1" + rc-util "^5.0.1" + +rc-table@~7.23.0: + version "7.23.0" + resolved "https://registry.yarnpkg.com/rc-table/-/rc-table-7.23.0.tgz#e5f76998ecf3246147d45ed311417c08886e6507" + integrity sha512-Q1gneB2+lUa8EzCCfbrq+jO1qNSwQv1RUUXKB84W/Stdp4EvGOt2+QqGyfotMNM4JUw0fgGLwY+WjnhUhnLuQQ== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.5" + rc-resize-observer "^1.1.0" + rc-util "^5.14.0" + shallowequal "^1.1.0" + +rc-tabs@~11.10.0: + version "11.10.7" + resolved "https://registry.yarnpkg.com/rc-tabs/-/rc-tabs-11.10.7.tgz#7d8b5dcc17f1608cf3b9425d80069f1415479335" + integrity sha512-7IKmcU7QU3CdYnJTabeXs2DDeLiXLyALC8fvOtgyWWFXUD47G5vG+4bFO3f9+AI+rcFAPpfwapZbXxgmiRuWYQ== + dependencies: + "@babel/runtime" "^7.11.2" + classnames "2.x" + rc-dropdown "^3.2.0" + rc-menu "^9.0.0" + rc-resize-observer "^1.0.0" + rc-util "^5.5.0" + +rc-textarea@^0.3.0, rc-textarea@~0.3.0: + version "0.3.7" + resolved "https://registry.yarnpkg.com/rc-textarea/-/rc-textarea-0.3.7.tgz#987142891efdedb774883c07e2f51b318fde5a11" + integrity sha512-yCdZ6binKmAQB13hc/oehh0E/QRwoPP1pjF21aHBxlgXO3RzPF6dUu4LG2R4FZ1zx/fQd2L1faktulrXOM/2rw== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "^2.2.1" + rc-resize-observer "^1.0.0" + rc-util "^5.7.0" + shallowequal "^1.1.0" + +rc-tooltip@^5.0.1, rc-tooltip@~5.1.1: + version "5.1.1" + resolved "https://registry.yarnpkg.com/rc-tooltip/-/rc-tooltip-5.1.1.tgz#94178ed162d0252bc4993b725f5dc2ac0fccf154" + integrity sha512-alt8eGMJulio6+4/uDm7nvV+rJq9bsfxFDCI0ljPdbuoygUscbsMYb6EQgwib/uqsXQUvzk+S7A59uYHmEgmDA== + dependencies: + "@babel/runtime" "^7.11.2" + rc-trigger "^5.0.0" + +rc-tree-select@~5.1.1: + version "5.1.4" + resolved "https://registry.yarnpkg.com/rc-tree-select/-/rc-tree-select-5.1.4.tgz#3577135399d1f4931b0f4d8245e0845861802e2b" + integrity sha512-sA6vTUQghzbjh3u6YAwJIebKkJEHUWDPFHQpfiPObqsEYqi9TKE1LvWqbJ77NbOlOARZq0KIb7LDGF8X0dikDQ== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "2.x" + rc-select "~14.0.0-alpha.8" + rc-tree "~5.4.3" + rc-util "^5.16.1" + +rc-tree@~5.4.3: + version "5.4.4" + resolved "https://registry.yarnpkg.com/rc-tree/-/rc-tree-5.4.4.tgz#2ea3663ad3c566aef79a46ba6a1e050d24323e01" + integrity sha512-2qoObRgp31DBXmVzMJmo4qmwP20XEa4hR3imWQtRPcgN3pmljW3WKFmZRrYdOFHz7CyTnRsFZR065bBkIoUpiA== + dependencies: + "@babel/runtime" "^7.10.1" + classnames "2.x" + rc-motion "^2.0.1" + rc-util "^5.16.1" + rc-virtual-list "^3.4.2" + +rc-trigger@^5.0.0, rc-trigger@^5.0.4, rc-trigger@^5.1.2, rc-trigger@^5.2.10: + version "5.2.10" + resolved "https://registry.yarnpkg.com/rc-trigger/-/rc-trigger-5.2.10.tgz#8a0057a940b1b9027eaa33beec8a6ecd85cce2b1" + integrity sha512-FkUf4H9BOFDaIwu42fvRycXMAvkttph9AlbCZXssZDVzz2L+QZ0ERvfB/4nX3ZFPh1Zd+uVGr1DEDeXxq4J1TA== + dependencies: + "@babel/runtime" "^7.11.2" + classnames "^2.2.6" + rc-align "^4.0.0" + rc-motion "^2.0.0" + rc-util "^5.5.0" + +rc-upload@~4.3.0: + version "4.3.3" + resolved "https://registry.yarnpkg.com/rc-upload/-/rc-upload-4.3.3.tgz#e237aa525e5313fa16f4d04d27f53c2f0e157bb8" + integrity sha512-YoJ0phCRenMj1nzwalXzciKZ9/FAaCrFu84dS5pphwucTC8GUWClcDID/WWNGsLFcM97NqIboDqrV82rVRhW/w== + dependencies: + "@babel/runtime" "^7.10.1" 
+ classnames "^2.2.5" + rc-util "^5.2.0" + +rc-util@^5.0.1, rc-util@^5.0.6, rc-util@^5.0.7, rc-util@^5.12.0, rc-util@^5.14.0, rc-util@^5.15.0, rc-util@^5.16.1, rc-util@^5.18.1, rc-util@^5.2.0, rc-util@^5.2.1, rc-util@^5.3.0, rc-util@^5.4.0, rc-util@^5.5.0, rc-util@^5.6.1, rc-util@^5.7.0, rc-util@^5.8.0, rc-util@^5.9.4, rc-util@^5.9.8: + version "5.18.1" + resolved "https://registry.yarnpkg.com/rc-util/-/rc-util-5.18.1.tgz#80bd1450b5254655d2fbea63e3d34f6871e9be79" + integrity sha512-24xaSrMZUEKh1+suDOtJWfPe9E6YrwryViZcoPO0miJTKzP4qhUlV5AAlKQ82AJilz/AOHfi3l6HoX8qa1ye8w== + dependencies: + "@babel/runtime" "^7.12.5" + react-is "^16.12.0" + shallowequal "^1.1.0" + +rc-virtual-list@^3.2.0, rc-virtual-list@^3.4.2: + version "3.4.2" + resolved "https://registry.yarnpkg.com/rc-virtual-list/-/rc-virtual-list-3.4.2.tgz#1078327aa7230b5e456d679ed2ce99f3c036ebd1" + integrity sha512-OyVrrPvvFcHvV0ssz5EDZ+7Rf5qLat/+mmujjchNw5FfbJWNDwkpQ99EcVE6+FtNRmX9wFa1LGNpZLUTvp/4GQ== + dependencies: + classnames "^2.2.6" + rc-resize-observer "^1.0.0" + rc-util "^5.0.7" + +react-dom@^16.13.1: + version "16.14.0" + resolved "https://registry.yarnpkg.com/react-dom/-/react-dom-16.14.0.tgz#7ad838ec29a777fb3c75c3a190f661cf92ab8b89" + integrity sha512-1gCeQXDLoIqMgqD3IO2Ah9bnf0w9kzhwN5q4FGnHZ67hBm9yePzB5JJAIQCc8x3pFnNlwFq4RidZggNAAkzWWw== + dependencies: + loose-envify "^1.1.0" + object-assign "^4.1.1" + prop-types "^15.6.2" + scheduler "^0.19.1" + +react-flame-graph@^1.4.0: + version "1.4.0" + resolved "https://registry.yarnpkg.com/react-flame-graph/-/react-flame-graph-1.4.0.tgz#52d118cc94348f630a812fc0ec530a5b73c30cdb" + integrity sha512-DaCK9ZX+xK0mNca72kUE5cu6T8hGe/KLsefQWf+eT9sVt+0WP1dVxZCGD8Svfn2KrZB9Mv011Intg/yG2YWSxA== + dependencies: + flow-bin "^0.118.0" + memoize-one "^3.1.1" + react-window "^1" + +react-is@^16.12.0, react-is@^16.13.1, react-is@^16.7.0: + version "16.13.1" + resolved "https://registry.yarnpkg.com/react-is/-/react-is-16.13.1.tgz#789729a4dc36de2999dc156dd6c1d9c18cea56a4" + integrity sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ== + +"react-is@^16.8.0 || ^17.0.0": + version "17.0.2" + resolved "https://registry.yarnpkg.com/react-is/-/react-is-17.0.2.tgz#e691d4a8e9c789365655539ab372762b0efb54f0" + integrity sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w== + +react-transition-group@^4.4.0: + version "4.4.2" + resolved "https://registry.yarnpkg.com/react-transition-group/-/react-transition-group-4.4.2.tgz#8b59a56f09ced7b55cbd53c36768b922890d5470" + integrity sha512-/RNYfRAMlZwDSr6z4zNKV6xu53/e2BuaBbGhbyYIXTrmgu/bGHzmqOs7mJSJBHy9Ud+ApHx3QjrkKSp1pxvlFg== + dependencies: + "@babel/runtime" "^7.5.5" + dom-helpers "^5.0.1" + loose-envify "^1.4.0" + prop-types "^15.6.2" + +react-window@^1: + version "1.8.6" + resolved "https://registry.yarnpkg.com/react-window/-/react-window-1.8.6.tgz#d011950ac643a994118632665aad0c6382e2a112" + integrity sha512-8VwEEYyjz6DCnGBsd+MgkD0KJ2/OXFULyDtorIiTz+QzwoP94tBoA7CnbtyXMm+cCeAUER5KJcPtWl9cpKbOBg== + dependencies: + "@babel/runtime" "^7.0.0" + memoize-one ">=3.1.1 <6" + +react@^16.13.1: + version "16.14.0" + resolved "https://registry.yarnpkg.com/react/-/react-16.14.0.tgz#94d776ddd0aaa37da3eda8fc5b6b18a4c9a3114d" + integrity sha512-0X2CImDkJGApiAlcf0ODKIneSwBPhqJawOa5wCtKbu7ZECrmS26NvtSILynQ66cgkT/RJ4LidJOc3bUESwmU8g== + dependencies: + loose-envify "^1.1.0" + object-assign "^4.1.1" + prop-types "^15.6.2" + +readable-stream@^2.0.1: + version "2.3.7" + resolved 
"https://registry.yarnpkg.com/readable-stream/-/readable-stream-2.3.7.tgz#1eca1cf711aef814c04f62252a36a62f6cb23b57" + integrity sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw== + dependencies: + core-util-is "~1.0.0" + inherits "~2.0.3" + isarray "~1.0.0" + process-nextick-args "~2.0.0" + safe-buffer "~5.1.1" + string_decoder "~1.1.1" + util-deprecate "~1.0.1" + +readable-stream@^3.0.6: + version "3.6.0" + resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.0.tgz#337bbda3adc0706bd3e024426a286d4b4b2c9198" + integrity sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA== + dependencies: + inherits "^2.0.3" + string_decoder "^1.1.1" + util-deprecate "^1.0.1" + +readdirp@~3.6.0: + version "3.6.0" + resolved "https://registry.yarnpkg.com/readdirp/-/readdirp-3.6.0.tgz#74a370bd857116e245b29cc97340cd431a02a6c7" + integrity sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA== + dependencies: + picomatch "^2.2.1" + +rechoir@^0.7.0: + version "0.7.1" + resolved "https://registry.yarnpkg.com/rechoir/-/rechoir-0.7.1.tgz#9478a96a1ca135b5e88fc027f03ee92d6c645686" + integrity sha512-/njmZ8s1wVeR6pjTZ+0nCnv8SpZNRMT2D1RLOJQESlYFDBvwpTA4KWJpZ+sBJ4+vhjILRcK7JIFdGCdxEAAitg== + dependencies: + resolve "^1.9.0" + +regenerator-runtime@^0.13.4: + version "0.13.9" + resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.13.9.tgz#8925742a98ffd90814988d7566ad30ca3b263b52" + integrity sha512-p3VT+cOEgxFsRRA9X4lkI1E+k2/CtnKtU4gcxyaCUreilL/vqI6CdZ3wxVUx3UOUg+gnUOQQcRI7BmSI656MYA== + +regexp.prototype.flags@^1.2.0: + version "1.4.1" + resolved "https://registry.yarnpkg.com/regexp.prototype.flags/-/regexp.prototype.flags-1.4.1.tgz#b3f4c0059af9e47eca9f3f660e51d81307e72307" + integrity sha512-pMR7hBVUUGI7PMA37m2ofIdQCsomVnas+Jn5UPGAHQ+/LlwKm/aTLJHdasmHRzlfeZwHiAOaRSo2rbBDm3nNUQ== + dependencies: + call-bind "^1.0.2" + define-properties "^1.1.3" + +relateurl@^0.2.7: + version "0.2.7" + resolved "https://registry.yarnpkg.com/relateurl/-/relateurl-0.2.7.tgz#54dbf377e51440aca90a4cd274600d3ff2d888a9" + integrity sha1-VNvzd+UUQKypCkzSdGANP/LYiKk= + +renderkid@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/renderkid/-/renderkid-3.0.0.tgz#5fd823e4d6951d37358ecc9a58b1f06836b6268a" + integrity sha512-q/7VIQA8lmM1hF+jn+sFSPWGlMkSAeNYcPLmDQx2zzuiDfaLrOmumR8iaUKlenFgh0XRPIUeSPlH3A+AW3Z5pg== + dependencies: + css-select "^4.1.3" + dom-converter "^0.2.0" + htmlparser2 "^6.1.0" + lodash "^4.17.21" + strip-ansi "^6.0.1" + +require-from-string@^2.0.2: + version "2.0.2" + resolved "https://registry.yarnpkg.com/require-from-string/-/require-from-string-2.0.2.tgz#89a7fdd938261267318eafe14f9c32e598c36909" + integrity sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw== + +requires-port@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/requires-port/-/requires-port-1.0.0.tgz#925d2601d39ac485e091cf0da5c6e694dc3dcaff" + integrity sha1-kl0mAdOaxIXgkc8NpcbmlNw9yv8= + +resize-observer-polyfill@^1.5.0, resize-observer-polyfill@^1.5.1: + version "1.5.1" + resolved "https://registry.yarnpkg.com/resize-observer-polyfill/-/resize-observer-polyfill-1.5.1.tgz#0e9020dd3d21024458d4ebd27e23e40269810464" + integrity sha512-LwZrotdHOo12nQuZlHEmtuXdqGoOD0OhaxopaNFxWzInpEgaLWoVuAMbTzixuosCx2nEG58ngzW3vxdWoxIgdg== + +resolve-cwd@^3.0.0: + version "3.0.0" + resolved 
"https://registry.yarnpkg.com/resolve-cwd/-/resolve-cwd-3.0.0.tgz#0f0075f1bb2544766cf73ba6a6e2adfebcb13f2d" + integrity sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg== + dependencies: + resolve-from "^5.0.0" + +resolve-from@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/resolve-from/-/resolve-from-5.0.0.tgz#c35225843df8f776df21c57557bc087e9dfdfc69" + integrity sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw== + +resolve@^1.9.0: + version "1.22.0" + resolved "https://registry.yarnpkg.com/resolve/-/resolve-1.22.0.tgz#5e0b8c67c15df57a89bdbabe603a002f21731198" + integrity sha512-Hhtrw0nLeSrFQ7phPp4OOcVjLPIeMnRlr5mcnVuMe7M/7eBn98A3hmFRLoFo3DLZkivSYwhRUJTyPyWAk56WLw== + dependencies: + is-core-module "^2.8.1" + path-parse "^1.0.7" + supports-preserve-symlinks-flag "^1.0.0" + +retry@^0.13.1: + version "0.13.1" + resolved "https://registry.yarnpkg.com/retry/-/retry-0.13.1.tgz#185b1587acf67919d63b357349e03537b2484658" + integrity sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg== + +reusify@^1.0.4: + version "1.0.4" + resolved "https://registry.yarnpkg.com/reusify/-/reusify-1.0.4.tgz#90da382b1e126efc02146e90845a88db12925d76" + integrity sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw== + +rimraf@^3.0.2: + version "3.0.2" + resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a" + integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA== + dependencies: + glob "^7.1.3" + +run-parallel@^1.1.9: + version "1.2.0" + resolved "https://registry.yarnpkg.com/run-parallel/-/run-parallel-1.2.0.tgz#66d1368da7bdf921eb9d95bd1a9229e7f21a43ee" + integrity sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA== + dependencies: + queue-microtask "^1.2.2" + +safe-buffer@5.1.2, safe-buffer@~5.1.0, safe-buffer@~5.1.1: + version "5.1.2" + resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.1.2.tgz#991ec69d296e0313747d59bdfd2b745c35f8828d" + integrity sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g== + +safe-buffer@5.2.1, safe-buffer@>=5.1.0, safe-buffer@^5.0.1, safe-buffer@^5.1.0, safe-buffer@~5.2.0: + version "5.2.1" + resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6" + integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ== + +"safer-buffer@>= 2.1.2 < 3": + version "2.1.2" + resolved "https://registry.yarnpkg.com/safer-buffer/-/safer-buffer-2.1.2.tgz#44fa161b0187b9549dd84bb91802f9bd8385cd6a" + integrity sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg== + +scheduler@^0.19.1: + version "0.19.1" + resolved "https://registry.yarnpkg.com/scheduler/-/scheduler-0.19.1.tgz#4f3e2ed2c1a7d65681f4c854fa8c5a1ccb40f196" + integrity sha512-n/zwRWRYSUj0/3g/otKDRPMh6qv2SYMWNq85IEa8iZyAv8od9zDYpGSnpBEjNgcMNq6Scbu5KfIPxNF72R/2EA== + dependencies: + loose-envify "^1.1.0" + object-assign "^4.1.1" + +schema-utils@^3.0.0, schema-utils@^3.1.0, schema-utils@^3.1.1: + version "3.1.1" + resolved "https://registry.yarnpkg.com/schema-utils/-/schema-utils-3.1.1.tgz#bc74c4b6b6995c1d88f76a8b77bea7219e0c8281" + integrity 
sha512-Y5PQxS4ITlC+EahLuXaY86TXfR7Dc5lw294alXOq86JAHCihAIZfqv8nNCWvaEJvaC51uN9hbLGeV0cFBdH+Fw== + dependencies: + "@types/json-schema" "^7.0.8" + ajv "^6.12.5" + ajv-keywords "^3.5.2" + +schema-utils@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/schema-utils/-/schema-utils-4.0.0.tgz#60331e9e3ae78ec5d16353c467c34b3a0a1d3df7" + integrity sha512-1edyXKgh6XnJsJSQ8mKWXnN/BVaIbFMLpouRUrXgVq7WYne5kw3MW7UPhO44uRXQSIpTSXoJbmrR2X0w9kUTyg== + dependencies: + "@types/json-schema" "^7.0.9" + ajv "^8.8.0" + ajv-formats "^2.1.1" + ajv-keywords "^5.0.0" + +scroll-into-view-if-needed@^2.2.25: + version "2.2.29" + resolved "https://registry.yarnpkg.com/scroll-into-view-if-needed/-/scroll-into-view-if-needed-2.2.29.tgz#551791a84b7e2287706511f8c68161e4990ab885" + integrity sha512-hxpAR6AN+Gh53AdAimHM6C8oTN1ppwVZITihix+WqalywBeFcQ6LdQP5ABNl26nX8GTEL7VT+b8lKpdqq65wXg== + dependencies: + compute-scroll-into-view "^1.0.17" + +select-hose@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/select-hose/-/select-hose-2.0.0.tgz#625d8658f865af43ec962bfc376a37359a4994ca" + integrity sha1-Yl2GWPhlr0Psliv8N2o3NZpJlMo= + +selfsigned@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/selfsigned/-/selfsigned-2.0.0.tgz#e927cd5377cbb0a1075302cff8df1042cc2bce5b" + integrity sha512-cUdFiCbKoa1mZ6osuJs2uDHrs0k0oprsKveFiiaBKCNq3SYyb5gs2HxhQyDNLCmL51ZZThqi4YNDpCK6GOP1iQ== + dependencies: + node-forge "^1.2.0" + +semver@^7.3.4, semver@^7.3.5: + version "7.3.5" + resolved "https://registry.yarnpkg.com/semver/-/semver-7.3.5.tgz#0b621c879348d8998e4b0e4be94b3f12e6018ef7" + integrity sha512-PoeGJYh8HK4BTO/a9Tf6ZG3veo/A7ZVsYrSA6J8ny9nb3B1VrpkuN+z9OE5wfE5p6H4LchYZsegiQgbJD94ZFQ== + dependencies: + lru-cache "^6.0.0" + +send@0.17.2: + version "0.17.2" + resolved "https://registry.yarnpkg.com/send/-/send-0.17.2.tgz#926622f76601c41808012c8bf1688fe3906f7820" + integrity sha512-UJYB6wFSJE3G00nEivR5rgWp8c2xXvJ3OPWPhmuteU0IKj8nKbG3DrjiOmLwpnHGYWAVwA69zmTm++YG0Hmwww== + dependencies: + debug "2.6.9" + depd "~1.1.2" + destroy "~1.0.4" + encodeurl "~1.0.2" + escape-html "~1.0.3" + etag "~1.8.1" + fresh "0.5.2" + http-errors "1.8.1" + mime "1.6.0" + ms "2.1.3" + on-finished "~2.3.0" + range-parser "~1.2.1" + statuses "~1.5.0" + +serialize-javascript@^6.0.0: + version "6.0.0" + resolved "https://registry.yarnpkg.com/serialize-javascript/-/serialize-javascript-6.0.0.tgz#efae5d88f45d7924141da8b5c3a7a7e663fefeb8" + integrity sha512-Qr3TosvguFt8ePWqsvRfrKyQXIiW+nGbYpy8XK24NQHE83caxWt+mIymTT19DGFbNWNLfEwsrkSmN64lVWB9ag== + dependencies: + randombytes "^2.1.0" + +serve-index@^1.9.1: + version "1.9.1" + resolved "https://registry.yarnpkg.com/serve-index/-/serve-index-1.9.1.tgz#d3768d69b1e7d82e5ce050fff5b453bea12a9239" + integrity sha1-03aNabHn2C5c4FD/9bRTvqEqkjk= + dependencies: + accepts "~1.3.4" + batch "0.6.1" + debug "2.6.9" + escape-html "~1.0.3" + http-errors "~1.6.2" + mime-types "~2.1.17" + parseurl "~1.3.2" + +serve-static@1.14.2: + version "1.14.2" + resolved "https://registry.yarnpkg.com/serve-static/-/serve-static-1.14.2.tgz#722d6294b1d62626d41b43a013ece4598d292bfa" + integrity sha512-+TMNA9AFxUEGuC0z2mevogSnn9MXKb4fa7ngeRMJaaGv8vTwnIEkKi+QGvPt33HSnf8pRS+WGM0EbMtCJLKMBQ== + dependencies: + encodeurl "~1.0.2" + escape-html "~1.0.3" + parseurl "~1.3.3" + send "0.17.2" + +setprototypeof@1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/setprototypeof/-/setprototypeof-1.1.0.tgz#d0bd85536887b6fe7c0d818cb962d9d91c54e656" + integrity 
sha512-BvE/TwpZX4FXExxOxZyRGQQv651MSwmWKZGqvmPcRIjDqWub67kTKuIMx43cZZrS/cBBzwBcNDWoFxt2XEFIpQ== + +setprototypeof@1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/setprototypeof/-/setprototypeof-1.2.0.tgz#66c9a24a73f9fc28cbe66b09fed3d33dcaf1b424" + integrity sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw== + +shallow-clone@^3.0.0: + version "3.0.1" + resolved "https://registry.yarnpkg.com/shallow-clone/-/shallow-clone-3.0.1.tgz#8f2981ad92531f55035b01fb230769a40e02efa3" + integrity sha512-/6KqX+GVUdqPuPPd2LxDDxzX6CAbjJehAAOKlNpqqUpAqPM6HeL8f+o3a+JsyGjn2lv0WY8UsTgUJjU9Ok55NA== + dependencies: + kind-of "^6.0.2" + +shallowequal@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/shallowequal/-/shallowequal-1.1.0.tgz#188d521de95b9087404fd4dcb68b13df0ae4e7f8" + integrity sha512-y0m1JoUZSlPAjXVtPPW70aZWfIL/dSP7AFkRnniLCrK/8MDKog3TySTBmckD+RObVxH0v4Tox67+F14PdED2oQ== + +shebang-command@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/shebang-command/-/shebang-command-2.0.0.tgz#ccd0af4f8835fbdc265b82461aaf0c36663f34ea" + integrity sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA== + dependencies: + shebang-regex "^3.0.0" + +shebang-regex@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/shebang-regex/-/shebang-regex-3.0.0.tgz#ae16f1644d873ecad843b0307b143362d4c42172" + integrity sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A== + +signal-exit@^3.0.3: + version "3.0.7" + resolved "https://registry.yarnpkg.com/signal-exit/-/signal-exit-3.0.7.tgz#a9a1767f8af84155114eaabd73f99273c8f59ad9" + integrity sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ== + +slash@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/slash/-/slash-3.0.0.tgz#6539be870c165adbd5240220dbe361f1bc4d4634" + integrity sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q== + +sockjs@^0.3.21: + version "0.3.24" + resolved "https://registry.yarnpkg.com/sockjs/-/sockjs-0.3.24.tgz#c9bc8995f33a111bea0395ec30aa3206bdb5ccce" + integrity sha512-GJgLTZ7vYb/JtPSSZ10hsOYIvEYsjbNU+zPdIHcUaWVNUEPivzxku31865sSSud0Da0W4lEeOPlmw93zLQchuQ== + dependencies: + faye-websocket "^0.11.3" + uuid "^8.3.2" + websocket-driver "^0.7.4" + +source-map-js@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/source-map-js/-/source-map-js-1.0.2.tgz#adbc361d9c62df380125e7f161f71c826f1e490c" + integrity sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw== + +source-map-support@~0.5.20: + version "0.5.21" + resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.21.tgz#04fe7c7f9e1ed2d662233c28cb2b35b9f63f6e4f" + integrity sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w== + dependencies: + buffer-from "^1.0.0" + source-map "^0.6.0" + +source-map@^0.6.0, source-map@^0.6.1, source-map@~0.6.0: + version "0.6.1" + resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.6.1.tgz#74722af32e9614e9c287a8d0bbde48b5e2f1a263" + integrity sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g== + +source-map@~0.7.2: + version "0.7.3" + resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.7.3.tgz#5302f8169031735226544092e64981f751750383" + integrity 
sha512-CkCj6giN3S+n9qrYiBTX5gystlENnRW5jZeNLHpe6aue+SrHcG5VYwujhW9s4dY31mEGsxBDrHR6oI69fTXsaQ== + +spdy-transport@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/spdy-transport/-/spdy-transport-3.0.0.tgz#00d4863a6400ad75df93361a1608605e5dcdcf31" + integrity sha512-hsLVFE5SjA6TCisWeJXFKniGGOpBgMLmerfO2aCyCU5s7nJ/rpAepqmFifv/GCbSbueEeAJJnmSQ2rKC/g8Fcw== + dependencies: + debug "^4.1.0" + detect-node "^2.0.4" + hpack.js "^2.1.6" + obuf "^1.1.2" + readable-stream "^3.0.6" + wbuf "^1.7.3" + +spdy@^4.0.2: + version "4.0.2" + resolved "https://registry.yarnpkg.com/spdy/-/spdy-4.0.2.tgz#b74f466203a3eda452c02492b91fb9e84a27677b" + integrity sha512-r46gZQZQV+Kl9oItvl1JZZqJKGr+oEkB08A6BzkiR7593/7IbtuncXHd2YoYeTsG4157ZssMu9KYvUHLcjcDoA== + dependencies: + debug "^4.1.0" + handle-thing "^2.0.0" + http-deceiver "^1.2.7" + select-hose "^2.0.0" + spdy-transport "^3.0.0" + +"statuses@>= 1.4.0 < 2", "statuses@>= 1.5.0 < 2", statuses@~1.5.0: + version "1.5.0" + resolved "https://registry.yarnpkg.com/statuses/-/statuses-1.5.0.tgz#161c7dac177659fd9811f43771fa99381478628c" + integrity sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow= + +string-convert@^0.2.0: + version "0.2.1" + resolved "https://registry.yarnpkg.com/string-convert/-/string-convert-0.2.1.tgz#6982cc3049fbb4cd85f8b24568b9d9bf39eeff97" + integrity sha1-aYLMMEn7tM2F+LJFaLnZvznu/5c= + +string_decoder@^1.1.1: + version "1.3.0" + resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e" + integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA== + dependencies: + safe-buffer "~5.2.0" + +string_decoder@~1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.1.1.tgz#9cf1611ba62685d7030ae9e4ba34149c3af03fc8" + integrity sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg== + dependencies: + safe-buffer "~5.1.0" + +strip-ansi@^6.0.1: + version "6.0.1" + resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9" + integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A== + dependencies: + ansi-regex "^5.0.1" + +strip-ansi@^7.0.0: + version "7.0.1" + resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-7.0.1.tgz#61740a08ce36b61e50e65653f07060d000975fb2" + integrity sha512-cXNxvT8dFNRVfhVME3JAe98mkXDYN2O1l7jmcwMnOslDeESg1rF/OZMtK0nRAhiari1unG5cD4jG3rapUAkLbw== + dependencies: + ansi-regex "^6.0.1" + +strip-final-newline@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/strip-final-newline/-/strip-final-newline-2.0.0.tgz#89b852fb2fcbe936f6f4b3187afb0a12c1ab58ad" + integrity sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA== + +style-loader@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/style-loader/-/style-loader-2.0.0.tgz#9669602fd4690740eaaec137799a03addbbc393c" + integrity sha512-Z0gYUJmzZ6ZdRUqpg1r8GsaFKypE+3xAzuFeMuoHgjc9KZv3wMyCRjQIWEbhoFSq7+7yoHXySDJyyWQaPajeiQ== + dependencies: + loader-utils "^2.0.0" + schema-utils "^3.0.0" + +supports-color@^7.1.0: + version "7.2.0" + resolved "https://registry.yarnpkg.com/supports-color/-/supports-color-7.2.0.tgz#1b7dcdcb32b8138801b3e478ba6a51caa89648da" + integrity sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw== + dependencies: + has-flag "^4.0.0" + +supports-color@^8.0.0: + 
version "8.1.1" + resolved "https://registry.yarnpkg.com/supports-color/-/supports-color-8.1.1.tgz#cd6fc17e28500cff56c1b86c0a7fd4a54a73005c" + integrity sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q== + dependencies: + has-flag "^4.0.0" + +supports-preserve-symlinks-flag@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz#6eda4bd344a3c94aea376d4cc31bc77311039e09" + integrity sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w== + +tapable@^1.0.0: + version "1.1.3" + resolved "https://registry.yarnpkg.com/tapable/-/tapable-1.1.3.tgz#a1fccc06b58db61fd7a45da2da44f5f3a3e67ba2" + integrity sha512-4WK/bYZmj8xLr+HUCODHGF1ZFzsYffasLUgEiMBY4fgtltdO6B4WJtlSbPaDTLpYTcGVwM2qLnFTICEcNxs3kA== + +tapable@^2.0.0, tapable@^2.1.1, tapable@^2.2.0: + version "2.2.1" + resolved "https://registry.yarnpkg.com/tapable/-/tapable-2.2.1.tgz#1967a73ef4060a82f12ab96af86d52fdb76eeca0" + integrity sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ== + +terser-webpack-plugin@^5.1.3: + version "5.3.1" + resolved "https://registry.yarnpkg.com/terser-webpack-plugin/-/terser-webpack-plugin-5.3.1.tgz#0320dcc270ad5372c1e8993fabbd927929773e54" + integrity sha512-GvlZdT6wPQKbDNW/GDQzZFg/j4vKU96yl2q6mcUkzKOgW4gwf1Z8cZToUCrz31XHlPWH8MVb1r2tFtdDtTGJ7g== + dependencies: + jest-worker "^27.4.5" + schema-utils "^3.1.1" + serialize-javascript "^6.0.0" + source-map "^0.6.1" + terser "^5.7.2" + +terser@^5.10.0, terser@^5.7.2: + version "5.12.0" + resolved "https://registry.yarnpkg.com/terser/-/terser-5.12.0.tgz#728c6bff05f7d1dcb687d8eace0644802a9dae8a" + integrity sha512-R3AUhNBGWiFc77HXag+1fXpAxTAFRQTJemlJKjAgD9r8xXTpjNKqIXwHM/o7Rh+O0kUJtS3WQVdBeMKFk5sw9A== + dependencies: + acorn "^8.5.0" + commander "^2.20.0" + source-map "~0.7.2" + source-map-support "~0.5.20" + +thunky@^1.0.2: + version "1.1.0" + resolved "https://registry.yarnpkg.com/thunky/-/thunky-1.1.0.tgz#5abaf714a9405db0504732bbccd2cedd9ef9537d" + integrity sha512-eHY7nBftgThBqOyHGVN+l8gF0BucP09fMo0oO/Lb0w1OF80dJv+lDVpXG60WMQvkcxAkNybKsrEIE3ZtKGmPrA== + +tiny-warning@^1.0.2: + version "1.0.3" + resolved "https://registry.yarnpkg.com/tiny-warning/-/tiny-warning-1.0.3.tgz#94a30db453df4c643d0fd566060d60a875d84754" + integrity sha512-lBN9zLN/oAf68o3zNXYrdCt1kP8WsiGW8Oo2ka41b2IM5JL/S1CTyX1rW0mb/zSuJun0ZUrDxx4sqvYS2FWzPA== + +to-regex-range@^5.0.1: + version "5.0.1" + resolved "https://registry.yarnpkg.com/to-regex-range/-/to-regex-range-5.0.1.tgz#1648c44aae7c8d988a326018ed72f5b4dd0392e4" + integrity sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ== + dependencies: + is-number "^7.0.0" + +toggle-selection@^1.0.6: + version "1.0.6" + resolved "https://registry.yarnpkg.com/toggle-selection/-/toggle-selection-1.0.6.tgz#6e45b1263f2017fa0acc7d89d78b15b8bf77da32" + integrity sha1-bkWxJj8gF/oKzH2J14sVuL932jI= + +toidentifier@1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/toidentifier/-/toidentifier-1.0.1.tgz#3be34321a88a820ed1bd80dfaa33e479fbb8dd35" + integrity sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA== + +tr46@~0.0.3: + version "0.0.3" + resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a" + integrity sha1-gYT9NH2snNwYWZLzpmIuFLnZq2o= + +ts-loader@^8.0.18: + version "8.3.0" + resolved 
"https://registry.yarnpkg.com/ts-loader/-/ts-loader-8.3.0.tgz#83360496d6f8004fab35825279132c93412edf33" + integrity sha512-MgGly4I6cStsJy27ViE32UoqxPTN9Xly4anxxVyaIWR+9BGxboV4EyJBGfR3RePV7Ksjj3rHmPZJeIt+7o4Vag== + dependencies: + chalk "^4.1.0" + enhanced-resolve "^4.0.0" + loader-utils "^2.0.0" + micromatch "^4.0.0" + semver "^7.3.4" + +tslib@^2.0.3: + version "2.3.1" + resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.3.1.tgz#e8a335add5ceae51aa261d32a490158ef042ef01" + integrity sha512-77EbyPPpMz+FRFRuAFlWMtmgUWGe9UOG2Z25NqCwiIjRhOf5iKGuzSe5P2w1laq+FkRy4p+PCuVkJSGkzTEKVw== + +type-is@~1.6.18: + version "1.6.18" + resolved "https://registry.yarnpkg.com/type-is/-/type-is-1.6.18.tgz#4e552cd05df09467dcbc4ef739de89f2cf37c131" + integrity sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g== + dependencies: + media-typer "0.3.0" + mime-types "~2.1.24" + +typescript@^4.0.3: + version "4.6.2" + resolved "https://registry.yarnpkg.com/typescript/-/typescript-4.6.2.tgz#fe12d2727b708f4eef40f51598b3398baa9611d4" + integrity sha512-HM/hFigTBHZhLXshn9sN37H085+hQGeJHJ/X7LpBWLID/fbc2acUMfU+lGD98X81sKP+pFa9f0DZmCwB9GnbAg== + +unpipe@1.0.0, unpipe@~1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/unpipe/-/unpipe-1.0.0.tgz#b2bf4ee8514aae6165b4817829d21b2ef49904ec" + integrity sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw= + +uri-js@^4.2.2: + version "4.4.1" + resolved "https://registry.yarnpkg.com/uri-js/-/uri-js-4.4.1.tgz#9b1a52595225859e55f669d928f88c6c57f2a77e" + integrity sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg== + dependencies: + punycode "^2.1.0" + +util-deprecate@^1.0.1, util-deprecate@^1.0.2, util-deprecate@~1.0.1: + version "1.0.2" + resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf" + integrity sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8= + +utila@~0.4: + version "0.4.0" + resolved "https://registry.yarnpkg.com/utila/-/utila-0.4.0.tgz#8a16a05d445657a3aea5eecc5b12a4fa5379772c" + integrity sha1-ihagXURWV6Oupe7MWxKk+lN5dyw= + +utils-merge@1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/utils-merge/-/utils-merge-1.0.1.tgz#9f95710f50a267947b2ccc124741c1028427e713" + integrity sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM= + +uuid@^8.3.2: + version "8.3.2" + resolved "https://registry.yarnpkg.com/uuid/-/uuid-8.3.2.tgz#80d5b5ced271bb9af6c445f21a1a04c606cefbe2" + integrity sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg== + +vary@~1.1.2: + version "1.1.2" + resolved "https://registry.yarnpkg.com/vary/-/vary-1.1.2.tgz#2299f02c6ded30d4a5961b0b9f74524a18f634fc" + integrity sha1-IpnwLG3tMNSllhsLn3RSShj2NPw= + +watchpack@^2.3.1: + version "2.3.1" + resolved "https://registry.yarnpkg.com/watchpack/-/watchpack-2.3.1.tgz#4200d9447b401156eeca7767ee610f8809bc9d25" + integrity sha512-x0t0JuydIo8qCNctdDrn1OzH/qDzk2+rdCOC3YzumZ42fiMqmQ7T3xQurykYMhYfHaPHTp4ZxAx2NfUo1K6QaA== + dependencies: + glob-to-regexp "^0.4.1" + graceful-fs "^4.1.2" + +wbuf@^1.1.0, wbuf@^1.7.3: + version "1.7.3" + resolved "https://registry.yarnpkg.com/wbuf/-/wbuf-1.7.3.tgz#c1d8d149316d3ea852848895cb6a0bfe887b87df" + integrity sha512-O84QOnr0icsbFGLS0O3bI5FswxzRr8/gHwWkDlQFskhSPryQXvrTMxjxGP4+iWYoauLoBvfDpkrOauZ+0iZpDA== + dependencies: + minimalistic-assert "^1.0.0" + +webidl-conversions@^3.0.0: + version "3.0.1" + resolved 
"https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-3.0.1.tgz#24534275e2a7bc6be7bc86611cc16ae0a5654871" + integrity sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE= + +webpack-cli@^4.5.0: + version "4.9.2" + resolved "https://registry.yarnpkg.com/webpack-cli/-/webpack-cli-4.9.2.tgz#77c1adaea020c3f9e2db8aad8ea78d235c83659d" + integrity sha512-m3/AACnBBzK/kMTcxWHcZFPrw/eQuY4Df1TxvIWfWM2x7mRqBQCqKEd96oCUa9jkapLBaFfRce33eGDb4Pr7YQ== + dependencies: + "@discoveryjs/json-ext" "^0.5.0" + "@webpack-cli/configtest" "^1.1.1" + "@webpack-cli/info" "^1.4.1" + "@webpack-cli/serve" "^1.6.1" + colorette "^2.0.14" + commander "^7.0.0" + execa "^5.0.0" + fastest-levenshtein "^1.0.12" + import-local "^3.0.2" + interpret "^2.2.0" + rechoir "^0.7.0" + webpack-merge "^5.7.3" + +webpack-dev-middleware@^5.3.1: + version "5.3.1" + resolved "https://registry.yarnpkg.com/webpack-dev-middleware/-/webpack-dev-middleware-5.3.1.tgz#aa079a8dedd7e58bfeab358a9af7dab304cee57f" + integrity sha512-81EujCKkyles2wphtdrnPg/QqegC/AtqNH//mQkBYSMqwFVCQrxM6ktB2O/SPlZy7LqeEfTbV3cZARGQz6umhg== + dependencies: + colorette "^2.0.10" + memfs "^3.4.1" + mime-types "^2.1.31" + range-parser "^1.2.1" + schema-utils "^4.0.0" + +webpack-dev-server@^4.7.4: + version "4.7.4" + resolved "https://registry.yarnpkg.com/webpack-dev-server/-/webpack-dev-server-4.7.4.tgz#d0ef7da78224578384e795ac228d8efb63d5f945" + integrity sha512-nfdsb02Zi2qzkNmgtZjkrMOcXnYZ6FLKcQwpxT7MvmHKc+oTtDsBju8j+NMyAygZ9GW1jMEUpy3itHtqgEhe1A== + dependencies: + "@types/bonjour" "^3.5.9" + "@types/connect-history-api-fallback" "^1.3.5" + "@types/express" "^4.17.13" + "@types/serve-index" "^1.9.1" + "@types/sockjs" "^0.3.33" + "@types/ws" "^8.2.2" + ansi-html-community "^0.0.8" + bonjour "^3.5.0" + chokidar "^3.5.3" + colorette "^2.0.10" + compression "^1.7.4" + connect-history-api-fallback "^1.6.0" + default-gateway "^6.0.3" + del "^6.0.0" + express "^4.17.1" + graceful-fs "^4.2.6" + html-entities "^2.3.2" + http-proxy-middleware "^2.0.0" + ipaddr.js "^2.0.1" + open "^8.0.9" + p-retry "^4.5.0" + portfinder "^1.0.28" + schema-utils "^4.0.0" + selfsigned "^2.0.0" + serve-index "^1.9.1" + sockjs "^0.3.21" + spdy "^4.0.2" + strip-ansi "^7.0.0" + webpack-dev-middleware "^5.3.1" + ws "^8.4.2" + +webpack-merge@^5.7.3: + version "5.8.0" + resolved "https://registry.yarnpkg.com/webpack-merge/-/webpack-merge-5.8.0.tgz#2b39dbf22af87776ad744c390223731d30a68f61" + integrity sha512-/SaI7xY0831XwP6kzuwhKWVKDP9t1QY1h65lAFLbZqMPIuYcD9QAW4u9STIbU9kaJbPBB/geU/gLr1wDjOhQ+Q== + dependencies: + clone-deep "^4.0.1" + wildcard "^2.0.0" + +webpack-sources@^3.2.3: + version "3.2.3" + resolved "https://registry.yarnpkg.com/webpack-sources/-/webpack-sources-3.2.3.tgz#2d4daab8451fd4b240cc27055ff6a0c2ccea0cde" + integrity sha512-/DyMEOrDgLKKIG0fmvtz+4dUX/3Ghozwgm6iPp8KRhvn+eQf9+Q7GWxVNMk3+uCPWfdXYC4ExGBckIXdFEfH1w== + +webpack@^5.28.0: + version "5.70.0" + resolved "https://registry.yarnpkg.com/webpack/-/webpack-5.70.0.tgz#3461e6287a72b5e6e2f4872700bc8de0d7500e6d" + integrity sha512-ZMWWy8CeuTTjCxbeaQI21xSswseF2oNOwc70QSKNePvmxE7XW36i7vpBMYZFAUHPwQiEbNGCEYIOOlyRbdGmxw== + dependencies: + "@types/eslint-scope" "^3.7.3" + "@types/estree" "^0.0.51" + "@webassemblyjs/ast" "1.11.1" + "@webassemblyjs/wasm-edit" "1.11.1" + "@webassemblyjs/wasm-parser" "1.11.1" + acorn "^8.4.1" + acorn-import-assertions "^1.7.6" + browserslist "^4.14.5" + chrome-trace-event "^1.0.2" + enhanced-resolve "^5.9.2" + es-module-lexer "^0.9.0" + eslint-scope "5.1.1" + events "^3.2.0" + glob-to-regexp "^0.4.1" + graceful-fs "^4.2.9" + 
json-parse-better-errors "^1.0.2" + loader-runner "^4.2.0" + mime-types "^2.1.27" + neo-async "^2.6.2" + schema-utils "^3.1.0" + tapable "^2.1.1" + terser-webpack-plugin "^5.1.3" + watchpack "^2.3.1" + webpack-sources "^3.2.3" + +websocket-driver@>=0.5.1, websocket-driver@^0.7.4: + version "0.7.4" + resolved "https://registry.yarnpkg.com/websocket-driver/-/websocket-driver-0.7.4.tgz#89ad5295bbf64b480abcba31e4953aca706f5760" + integrity sha512-b17KeDIQVjvb0ssuSDF2cYXSg2iztliJ4B9WdsuB6J952qCPKmnVq4DyW5motImXHDC1cBT/1UezrJVsKw5zjg== + dependencies: + http-parser-js ">=0.5.1" + safe-buffer ">=5.1.0" + websocket-extensions ">=0.1.1" + +websocket-extensions@>=0.1.1: + version "0.1.4" + resolved "https://registry.yarnpkg.com/websocket-extensions/-/websocket-extensions-0.1.4.tgz#7f8473bc839dfd87608adb95d7eb075211578a42" + integrity sha512-OqedPIGOfsDlo31UNwYbCFMSaO9m9G/0faIHj5/dZFDMFqPTcx6UwqyOy3COEaEOg/9VsGIpdqn62W5KhoKSpg== + +whatwg-fetch@>=0.10.0: + version "3.6.2" + resolved "https://registry.yarnpkg.com/whatwg-fetch/-/whatwg-fetch-3.6.2.tgz#dced24f37f2624ed0281725d51d0e2e3fe677f8c" + integrity sha512-bJlen0FcuU/0EMLrdbJ7zOnW6ITZLrZMIarMUVmdKtsGvZna8vxKYaexICWPfZ8qwf9fzNq+UEIZrnSaApt6RA== + +whatwg-url@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-5.0.0.tgz#966454e8765462e37644d3626f6742ce8b70965d" + integrity sha1-lmRU6HZUYuN2RNNib2dCzotwll0= + dependencies: + tr46 "~0.0.3" + webidl-conversions "^3.0.0" + +which@^2.0.1: + version "2.0.2" + resolved "https://registry.yarnpkg.com/which/-/which-2.0.2.tgz#7c6a8dd0a636a0327e10b59c9286eee93f3f51b1" + integrity sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA== + dependencies: + isexe "^2.0.0" + +wildcard@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/wildcard/-/wildcard-2.0.0.tgz#a77d20e5200c6faaac979e4b3aadc7b3dd7f8fec" + integrity sha512-JcKqAHLPxcdb9KM49dufGXn2x3ssnfjbcaQdLlfZsL9rH9wgDQjUtDxbo8NE0F6SFvydeu1VhZe7hZuHsB2/pw== + +wrappy@1: + version "1.0.2" + resolved "https://registry.yarnpkg.com/wrappy/-/wrappy-1.0.2.tgz#b5243d8f3ec1aa35f1364605bc0d1036e30ab69f" + integrity sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8= + +ws@^8.4.2: + version "8.5.0" + resolved "https://registry.yarnpkg.com/ws/-/ws-8.5.0.tgz#bfb4be96600757fe5382de12c670dab984a1ed4f" + integrity sha512-BWX0SWVgLPzYwF8lTzEy1egjhS4S4OEAHfsO8o65WOVsrnSRGaSiUaa9e0ggGlkMTtBlmOpEXiie9RUcBO86qg== + +yallist@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72" + integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A== diff --git a/tb_plugins/profiling/tb_plugin/packaging/torch_tb_profiler/meta.yaml b/tb_plugins/profiling/tb_plugin/packaging/torch_tb_profiler/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab105fd04d21315cf57376e947bd52ddd98d417b --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/packaging/torch_tb_profiler/meta.yaml @@ -0,0 +1,39 @@ +{% set data = load_setup_py_data(setup_file='../../setup.py', from_recipe_dir=True) %} + +package: + name: torch_tb_profiler + version: "{{ data.get('version') }}" + +source: + path: "{{ environ.get('SOURCE_ROOT_DIR', '../..') }}" + +requirements: + host: + - python + - setuptools + - pytorch + + run: + - python + - pandas + - tensorboard + +build: + noarch: python + script: python setup.py install --single-version-externally-managed --record=record.txt + +test: + imports: + - 
torch_tb_profiler
+  source_files:
+    - test
+  requires:
+    - tensorboard
+    - pandas
+
+
+about:
+  home: https://github.com/pytorch/kineto/tree/main/tb_plugin
+  license: BSD
+  license_file: LICENSE
+  summary: 'TensorBoard Plugin that provides visualization of PyTorch profiling'
diff --git a/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_0/worker0.1623143089861.pt.trace.json.gz b/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_0/worker0.1623143089861.pt.trace.json.gz
new file mode 100644
index 0000000000000000000000000000000000000000..769c3eb7843639ac114d183e07304b6d44931452
Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_0/worker0.1623143089861.pt.trace.json.gz differ
diff --git a/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_0/worker0.1623143566756.pt.trace.json.gz b/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_0/worker0.1623143566756.pt.trace.json.gz
new file mode 100644
index 0000000000000000000000000000000000000000..383a066433725823a57b0dfd047d718b65d4741b
Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_0/worker0.1623143566756.pt.trace.json.gz differ
diff --git a/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_4/worker0.1623212756351.pt.trace.json.gz b/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_4/worker0.1623212756351.pt.trace.json.gz
new file mode 100644
index 0000000000000000000000000000000000000000..234cf25df597fb99765ee2a49978f5f3c4bbbaf9
Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_4/worker0.1623212756351.pt.trace.json.gz differ
diff --git a/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_4/worker0.1623213129365.pt.trace.json.gz b/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_4/worker0.1623213129365.pt.trace.json.gz
new file mode 100644
index 0000000000000000000000000000000000000000..3e633b5fde9d2e99ea8b07a67771183b7011bbc5
Binary files /dev/null and b/tb_plugins/profiling/tb_plugin/samples/resnet50_num_workers_4/worker0.1623213129365.pt.trace.json.gz differ
diff --git a/tb_plugins/profiling/tb_plugin/setup.py b/tb_plugins/profiling/tb_plugin/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..b32dc154a289773ec6d2a69e031363806c074574
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/setup.py
@@ -0,0 +1,105 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# -------------------------------------------------------------------------- +import os +import pathlib +import setuptools +import subprocess + + +def read(rel_path): + here = os.path.abspath(os.path.dirname(__file__)) + with open(os.path.join(here, rel_path)) as fp: + return fp.read() + + +def get_version(rel_path): + for line in read(rel_path).splitlines(): + if line.startswith("__version__"): + delim = '"' if '"' in line else "'" + version = line.split(delim)[1] + + if os.getenv('TORCH_TB_PROFILER_BUILD_VERSION'): + version = os.getenv('TORCH_TB_PROFILER_BUILD_VERSION') + return version + + +INSTALL_REQUIRED = [ + "pandas >= 1.0.0", + "tensorboard >= 1.15, !=2.1.0" +] + +TESTS_REQUIRED = INSTALL_REQUIRED + [ + "torch >= 1.8", + "torchvision >= 0.8" +] + +EXTRAS = { + "s3": ["boto3"], + "blob": ["azure-storage-blob"], + "gs": ["google-cloud-storage"] +} + + +class build_fe(setuptools.Command): + """Build the frontend""" + description = "run yarn build on frontend directory" + + user_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + cwd = pathlib.Path().absolute() + root = pathlib.Path(__file__).parent.absolute() + os.chdir(root / "fe") + subprocess.run(["yarn", "build:copy"], check=True) + # restore the working directory + os.chdir(cwd) + + +setuptools.setup( + name="torch_tb_profiler", + version=get_version(os.path.join('torch_tb_profiler', '__init__.py')), + description="PyTorch Profiler TensorBoard Plugin", + long_description="PyTorch Profiler TensorBoard Plugin : \ + https://github.com/pytorch/kineto/tree/main/tb_plugin", + url="https://github.com/pytorch/kineto/tree/main/tb_plugin", + author="PyTorch Team", + author_email="packages@pytorch.org", + cmdclass={ + "build_fe": build_fe + }, + packages=setuptools.find_packages(), + package_data={ + "torch_tb_profiler": ["static/**"], + }, + entry_points={ + "tensorboard_plugins": [ + "torch_profiler = torch_tb_profiler.plugin:TorchProfilerPlugin", + ], + }, + python_requires=">=3.6.2", + install_requires=INSTALL_REQUIRED, + tests_require=TESTS_REQUIRED, + classifiers=[ + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: BSD License', + 'Programming Language :: Python :: 3', + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Mathematics', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Software Development', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules', + ], + license='BSD-3', + keywords='pytorch tensorboard profile plugin', + extras_require=EXTRAS +) diff --git a/tb_plugins/profiling/tb_plugin/test/gpu_metrics_expected.json b/tb_plugins/profiling/tb_plugin/test/gpu_metrics_expected.json new file mode 100644 index 0000000000000000000000000000000000000000..81f03632c83fa6abb41fa42011f7d7616b7d9d3a --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/test/gpu_metrics_expected.json @@ -0,0 +1,3105 @@ + +{ + "schemaVersion": 1, + + "computeProperties": [ + + { + "id": 0, "name": "Tesla V100-DGXS-32GB", "totalGlobalMem": 34084028416, + "major": 7, "minor": 0, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiProcessor": 2048, + "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, + "multiProcessorCount": 80, "sharedMemPerBlockOptin": 98304 + }, + + { + "id": 1, "name": "Tesla 
V100-DGXS-32GB", "totalGlobalMem": 34087305216, + "major": 7, "minor": 0, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiProcessor": 2048, + "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, + "multiProcessorCount": 80, "sharedMemPerBlockOptin": 98304 + }, + + { + "id": 2, "name": "Tesla V100-DGXS-32GB", "totalGlobalMem": 34087305216, + "major": 7, "minor": 0, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiProcessor": 2048, + "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, + "multiProcessorCount": 80, "sharedMemPerBlockOptin": 98304 + }, + + { + "id": 3, "name": "Tesla V100-DGXS-32GB", "totalGlobalMem": 34087305216, + "major": 7, "minor": 0, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiProcessor": 2048, + "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, + "multiProcessorCount": 80, "sharedMemPerBlockOptin": 98304 + } + ], + "traceEvents": [ + + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187223197, "dur": 21, + "args": { + "Device": 24572, "External id": 2, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187223264, "dur": 5, + "args": { + "Device": 24572, "External id": 3, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zeros", "pid": 24572, "tid": "24572", + "ts": 1621401187223182, "dur": 99, + "args": { + "Device": 24572, "External id": 1, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187223376, "dur": 19, + "args": { + "Device": 24572, "External id": 5, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187223480, "dur": 18, + "args": { + "Device": 24572, "External id": 7, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187223530, "dur": 5, + "args": { + "Device": 24572, "External id": 8, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zeros", "pid": 24572, "tid": "24572", + "ts": 1621401187223469, "dur": 72, + "args": { + "Device": 24572, "External id": 6, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187223622, "dur": 19, + "args": { + "Device": 24572, "External id": 10, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187223790, "dur": 12, + "args": { + "Device": 24572, "External id": 13, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::unsqueeze", "pid": 24572, "tid": "24572", + "ts": 1621401187223777, "dur": 50, + "args": { + "Device": 24572, "External id": 12, + "Trace name": "PyTorch 
Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187223850, "dur": 7, + "args": { + "Device": 24572, "External id": 15, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::unsqueeze", "pid": 24572, "tid": "24572", + "ts": 1621401187223841, "dur": 24, + "args": { + "Device": 24572, "External id": 14, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187223904, "dur": 16, + "args": { + "Device": 24572, "External id": 18, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::resize_", "pid": 24572, "tid": "24572", + "ts": 1621401187223945, "dur": 14, + "args": { + "Device": 24572, "External id": 19, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_cat", "pid": 24572, "tid": "24572", + "ts": 1621401187223888, "dur": 87, + "args": { + "Device": 24572, "External id": 17, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::cat", "pid": 24572, "tid": "24572", + "ts": 1621401187223876, "dur": 106, + "args": { + "Device": 24572, "External id": 16, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::stack", "pid": 24572, "tid": "24572", + "ts": 1621401187223752, "dur": 245, + "args": { + "Device": 24572, "External id": 11, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187224094, "dur": 12, + "args": { + "Device": 24572, "External id": 22, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::unsqueeze", "pid": 24572, "tid": "24572", + "ts": 1621401187224074, "dur": 43, + "args": { + "Device": 24572, "External id": 21, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187224137, "dur": 6, + "args": { + "Device": 24572, "External id": 24, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::unsqueeze", "pid": 24572, "tid": "24572", + "ts": 1621401187224128, "dur": 21, + "args": { + "Device": 24572, "External id": 23, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187224184, "dur": 15, + "args": { + "Device": 24572, "External id": 27, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::resize_", "pid": 24572, "tid": "24572", + "ts": 1621401187224223, "dur": 12, + "args": { + "Device": 24572, "External id": 28, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_cat", "pid": 24572, "tid": "24572", + "ts": 1621401187224169, "dur": 79, + "args": { + "Device": 24572, "External id": 26, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + 
}, + { + "ph": "X", "cat": "Operator", + "name": "aten::cat", "pid": 24572, "tid": "24572", + "ts": 1621401187224159, "dur": 96, + "args": { + "Device": 24572, "External id": 25, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::stack", "pid": 24572, "tid": "24572", + "ts": 1621401187224056, "dur": 213, + "args": { + "Device": 24572, "External id": 20, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__", "pid": 24572, "tid": "24572", + "ts": 1621401187223604, "dur": 725, + "args": { + "Device": 24572, "External id": 9, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187224415, "dur": 54, + "args": { + "Device": 24572, "External id": 30, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::copy_", "pid": 24572, "tid": "24572", + "ts": 1621401187224496, "dur": 80, + "args": { + "Device": 24572, "External id": 31, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 24572, "tid": "24572", + "ts": 1621401187224398, "dur": 193, + "args": { + "Device": 24572, "External id": 29, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187224645, "dur": 51, + "args": { + "Device": 24572, "External id": 33, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::copy_", "pid": 24572, "tid": "24572", + "ts": 1621401187224720, "dur": 65, + "args": { + "Device": 24572, "External id": 34, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 24572, "tid": "24572", + "ts": 1621401187224631, "dur": 168, + "args": { + "Device": 24572, "External id": 32, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187224956, "dur": 14, + "args": { + "Device": 24572, "External id": 38, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24572", + "ts": 1621401187224945, "dur": 37, + "args": { + "Device": 24572, "External id": 37, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24572", + "ts": 1621401187224917, "dur": 101, + "args": { + "Device": 24572, "External id": 36, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24572", + "ts": 1621401187225058, "dur": 33, + "args": { + "Device": 24572, "External id": 40, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence 
number": 23 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187225181, "dur": 41, + "args": { + "Device": 24572, "External id": 42, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 24572, "tid": "24572", + "ts": 1621401187225112, "dur": 197, + "args": { + "Device": 24572, "External id": 41, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 23 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24572", + "ts": 1621401187225367, "dur": 17, + "args": { + "Device": 24572, "External id": 44, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_unsafe_view", "pid": 24572, "tid": "24572", + "ts": 1621401187225336, "dur": 79, + "args": { + "Device": 24572, "External id": 43, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 24 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::matmul", "pid": 24572, "tid": "24572", + "ts": 1621401187225037, "dur": 394, + "args": { + "Device": 24572, "External id": 39, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 23 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187225449, "dur": 107, + "args": { + "Device": 24572, "External id": 45, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 25 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::linear", "pid": 24572, "tid": "24572", + "ts": 1621401187224907, "dur": 664, + "args": { + "Device": 24572, "External id": 35, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187225662, "dur": 25, + "args": { + "Device": 24572, "External id": 47, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::resize_", "pid": 24572, "tid": "24572", + "ts": 1621401187225746, "dur": 30, + "args": { + "Device": 24572, "External id": 50, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::clamp_min", "pid": 24572, "tid": "24572", + "ts": 1621401187225721, "dur": 105, + "args": { + "Device": 24572, "External id": 49, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::clamp", "pid": 24572, "tid": "24572", + "ts": 1621401187225709, "dur": 128, + "args": { + "Device": 24572, "External id": 48, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::clamp", "pid": 24572, "tid": "24572", + "ts": 1621401187225606, "dur": 263, + "args": { + "Device": 24572, "External id": 46, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 26 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187225978, "dur": 14, + "args": { + "Device": 24572, "External id": 54, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": 
"aten::transpose", "pid": 24572, "tid": "24572", + "ts": 1621401187225968, "dur": 36, + "args": { + "Device": 24572, "External id": 53, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24572", + "ts": 1621401187225941, "dur": 98, + "args": { + "Device": 24572, "External id": 52, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 27 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24572", + "ts": 1621401187226077, "dur": 60, + "args": { + "Device": 24572, "External id": 56, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 28 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187226233, "dur": 41, + "args": { + "Device": 24572, "External id": 58, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 24572, "tid": "24572", + "ts": 1621401187226161, "dur": 197, + "args": { + "Device": 24572, "External id": 57, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 29 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24572", + "ts": 1621401187226416, "dur": 17, + "args": { + "Device": 24572, "External id": 60, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_unsafe_view", "pid": 24572, "tid": "24572", + "ts": 1621401187226384, "dur": 79, + "args": { + "Device": 24572, "External id": 59, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 30 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::matmul", "pid": 24572, "tid": "24572", + "ts": 1621401187226057, "dur": 422, + "args": { + "Device": 24572, "External id": 55, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 28 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187226497, "dur": 103, + "args": { + "Device": 24572, "External id": 61, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 31 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::linear", "pid": 24572, "tid": "24572", + "ts": 1621401187225932, "dur": 683, + "args": { + "Device": 24572, "External id": 51, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 27 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::broadcast_tensors", "pid": 24572, "tid": "24572", + "ts": 1621401187226708, "dur": 11, + "args": { + "Device": 24572, "External id": 62, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 32 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187226827, "dur": 41, + "args": { + "Device": 24572, "External id": 64, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187226955, "dur": 35, + "args": { + "Device": 24572, "External id": 66, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": 
"Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187227020, "dur": 11, + "args": { + "Device": 24572, "External id": 67, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::sum", "pid": 24572, "tid": "24572", + "ts": 1621401187226930, "dur": 176, + "args": { + "Device": 24572, "External id": 65, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mse_loss", "pid": 24572, "tid": "24572", + "ts": 1621401187226753, "dur": 445, + "args": { + "Device": 24572, "External id": 63, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 32 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187227327, "dur": 21, + "args": { + "Device": 24572, "External id": 69, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187227368, "dur": 5, + "args": { + "Device": 24572, "External id": 70, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zeros", "pid": 24572, "tid": "24572", + "ts": 1621401187227314, "dur": 65, + "args": { + "Device": 24572, "External id": 68, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187227464, "dur": 18, + "args": { + "Device": 24572, "External id": 72, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24572", + "ts": 1621401187227576, "dur": 49, + "args": { + "Device": 24572, "External id": 74, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187227553, "dur": 97, + "args": { + "Device": 24572, "External id": 73, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 33 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24572", + "ts": 1621401187227707, "dur": 43, + "args": { + "Device": 24572, "External id": 76, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187227689, "dur": 79, + "args": { + "Device": 24572, "External id": 75, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 33 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24572", + "ts": 1621401187227823, "dur": 42, + "args": { + "Device": 24572, "External id": 78, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187227805, "dur": 77, + "args": { + "Device": 24572, "External id": 77, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 33 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24572", + "ts": 1621401187227937, "dur": 41, + "args": { + "Device": 24572, "External id": 80, + "Trace name": 
"PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187227919, "dur": 77, + "args": { + "Device": 24572, "External id": 79, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 33 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "Optimizer.zero_grad#SGD.zero_grad", "pid": 24572, "tid": "24572", + "ts": 1621401187227446, "dur": 606, + "args": { + "Device": 24572, "External id": 71, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187228150, "dur": 53, + "args": { + "Device": 24572, "External id": 83, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty_like", "pid": 24572, "tid": "24572", + "ts": 1621401187228137, "dur": 81, + "args": { + "Device": 24572, "External id": 82, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24572", + "ts": 1621401187228235, "dur": 50, + "args": { + "Device": 24572, "External id": 84, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::ones_like", "pid": 24572, "tid": "24572", + "ts": 1621401187228128, "dur": 169, + "args": { + "Device": 24572, "External id": 81, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187228708, "dur": 79, + "args": { + "Device": 24572, "External id": 89, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty_like", "pid": 24572, "tid": "24610", + "ts": 1621401187228680, "dur": 146, + "args": { + "Device": 24572, "External id": 88, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24610", + "ts": 1621401187228885, "dur": 93, + "args": { + "Device": 24572, "External id": 91, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24610", + "ts": 1621401187228858, "dur": 147, + "args": { + "Device": 24572, "External id": 90, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zeros_like", "pid": 24572, "tid": "24610", + "ts": 1621401187228647, "dur": 369, + "args": { + "Device": 24572, "External id": 87, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mse_loss_backward", "pid": 24572, "tid": "24610", + "ts": 1621401187229048, "dur": 122, + "args": { + "Device": 24572, "External id": 92, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mse_loss_backward", "pid": 24572, "tid": "24610", + "ts": 1621401187228603, "dur": 614, + "args": { + "Device": 24572, "External id": 86, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "MseLossBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187228516, "dur": 727, + "args": { + "Device": 24572, "External id": 
85, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 32 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "AddBackward1", "pid": 24572, "tid": "24610", + "ts": 1621401187229384, "dur": 17, + "args": { + "Device": 24572, "External id": 93, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 31 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187229506, "dur": 73, + "args": { + "Device": 24572, "External id": 95, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::sum", "pid": 24572, "tid": "24610", + "ts": 1621401187229459, "dur": 279, + "args": { + "Device": 24572, "External id": 94, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24610", + "ts": 1621401187229788, "dur": 65, + "args": { + "Device": 24572, "External id": 96, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24610", + "ts": 1621401187230059, "dur": 131, + "args": { + "Device": 24572, "External id": 98, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "torch::autograd::AccumulateGrad", "pid": 24572, "tid": "24610", + "ts": 1621401187230028, "dur": 228, + "args": { + "Device": 24572, "External id": 97, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24610", + "ts": 1621401187230405, "dur": 61, + "args": { + "Device": 24572, "External id": 101, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::reshape", "pid": 24572, "tid": "24610", + "ts": 1621401187230383, "dur": 107, + "args": { + "Device": 24572, "External id": 100, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "UnsafeViewBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187230354, "dur": 146, + "args": { + "Device": 24572, "External id": 99, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 30 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187230751, "dur": 22, + "args": { + "Device": 24572, "External id": 105, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187230732, "dur": 65, + "args": { + "Device": 24572, "External id": 104, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187230710, "dur": 124, + "args": { + "Device": 24572, "External id": 103, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::conj", "pid": 24572, "tid": "24610", + "ts": 1621401187230862, "dur": 7, + "args": { + "Device": 24572, "External id": 106, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 
1621401187230935, "dur": 73, + "args": { + "Device": 24572, "External id": 108, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 24572, "tid": "24610", + "ts": 1621401187230889, "dur": 235, + "args": { + "Device": 24572, "External id": 107, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187231211, "dur": 23, + "args": { + "Device": 24572, "External id": 111, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187231191, "dur": 69, + "args": { + "Device": 24572, "External id": 110, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187231168, "dur": 129, + "args": { + "Device": 24572, "External id": 109, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187231376, "dur": 17, + "args": { + "Device": 24572, "External id": 114, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187231360, "dur": 49, + "args": { + "Device": 24572, "External id": 113, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187231340, "dur": 100, + "args": { + "Device": 24572, "External id": 112, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::conj", "pid": 24572, "tid": "24610", + "ts": 1621401187231465, "dur": 6, + "args": { + "Device": 24572, "External id": 115, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187231534, "dur": 72, + "args": { + "Device": 24572, "External id": 117, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 24572, "tid": "24610", + "ts": 1621401187231491, "dur": 225, + "args": { + "Device": 24572, "External id": 116, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "MmBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187230626, "dur": 1124, + "args": { + "Device": 24572, "External id": 102, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 29 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24610", + "ts": 1621401187231992, "dur": 61, + "args": { + "Device": 24572, "External id": 120, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::reshape", "pid": 24572, "tid": "24610", + "ts": 1621401187231970, "dur": 108, + "args": { + "Device": 24572, "External id": 119, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "ViewBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187231941, "dur": 166, + "args": { 
+ "Device": 24572, "External id": 118, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 28 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187232305, "dur": 21, + "args": { + "Device": 24572, "External id": 124, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187232286, "dur": 62, + "args": { + "Device": 24572, "External id": 123, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187232265, "dur": 123, + "args": { + "Device": 24572, "External id": 122, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "TBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187232239, "dur": 161, + "args": { + "Device": 24572, "External id": 121, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 27 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24610", + "ts": 1621401187232535, "dur": 85, + "args": { + "Device": 24572, "External id": 126, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "torch::autograd::AccumulateGrad", "pid": 24572, "tid": "24610", + "ts": 1621401187232515, "dur": 148, + "args": { + "Device": 24572, "External id": 125, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187232790, "dur": 47, + "args": { + "Device": 24572, "External id": 129, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24610", + "ts": 1621401187232866, "dur": 68, + "args": { + "Device": 24572, "External id": 130, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::scalar_tensor", "pid": 24572, "tid": "24610", + "ts": 1621401187232776, "dur": 174, + "args": { + "Device": 24572, "External id": 128, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187233023, "dur": 27, + "args": { + "Device": 24572, "External id": 132, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_local_scalar_dense", "pid": 24572, "tid": "24610", + "ts": 1621401187233192, "dur": 6, + "args": { + "Device": 24572, "External id": 135, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::item", "pid": 24572, "tid": "24610", + "ts": 1621401187233184, "dur": 24, + "args": { + "Device": 24572, "External id": 134, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::resize_", "pid": 24572, "tid": "24610", + "ts": 1621401187233251, "dur": 41, + "args": { + "Device": 24572, "External id": 136, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::ge", "pid": 24572, "tid": "24610", + "ts": 
1621401187233168, "dur": 182, + "args": { + "Device": 24572, "External id": 133, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::ge", "pid": 24572, "tid": "24610", + "ts": 1621401187232971, "dur": 404, + "args": { + "Device": 24572, "External id": 131, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187233430, "dur": 15, + "args": { + "Device": 24572, "External id": 139, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::expand", "pid": 24572, "tid": "24610", + "ts": 1621401187233414, "dur": 62, + "args": { + "Device": 24572, "External id": 138, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187233508, "dur": 10, + "args": { + "Device": 24572, "External id": 141, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::expand", "pid": 24572, "tid": "24610", + "ts": 1621401187233494, "dur": 48, + "args": { + "Device": 24572, "External id": 140, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187233571, "dur": 10, + "args": { + "Device": 24572, "External id": 143, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::expand", "pid": 24572, "tid": "24610", + "ts": 1621401187233558, "dur": 43, + "args": { + "Device": 24572, "External id": 142, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187233649, "dur": 46, + "args": { + "Device": 24572, "External id": 145, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_s_where", "pid": 24572, "tid": "24610", + "ts": 1621401187233620, "dur": 167, + "args": { + "Device": 24572, "External id": 144, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::where", "pid": 24572, "tid": "24610", + "ts": 1621401187233398, "dur": 409, + "args": { + "Device": 24572, "External id": 137, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "ClampBackward1", "pid": 24572, "tid": "24610", + "ts": 1621401187232724, "dur": 1110, + "args": { + "Device": 24572, "External id": 127, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 26 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "AddBackward1", "pid": 24572, "tid": "24610", + "ts": 1621401187233941, "dur": 12, + "args": { + "Device": 24572, "External id": 146, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 25 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187234021, "dur": 46, + "args": { + "Device": 24572, "External id": 148, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::sum", "pid": 24572, "tid": 
"24610", + "ts": 1621401187233990, "dur": 182, + "args": { + "Device": 24572, "External id": 147, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24610", + "ts": 1621401187234208, "dur": 43, + "args": { + "Device": 24572, "External id": 149, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24610", + "ts": 1621401187234378, "dur": 84, + "args": { + "Device": 24572, "External id": 151, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "torch::autograd::AccumulateGrad", "pid": 24572, "tid": "24610", + "ts": 1621401187234357, "dur": 144, + "args": { + "Device": 24572, "External id": 150, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24610", + "ts": 1621401187234593, "dur": 39, + "args": { + "Device": 24572, "External id": 154, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::reshape", "pid": 24572, "tid": "24610", + "ts": 1621401187234580, "dur": 67, + "args": { + "Device": 24572, "External id": 153, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "UnsafeViewBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187234561, "dur": 92, + "args": { + "Device": 24572, "External id": 152, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 24 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187234803, "dur": 14, + "args": { + "Device": 24572, "External id": 158, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187234792, "dur": 41, + "args": { + "Device": 24572, "External id": 157, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187234778, "dur": 79, + "args": { + "Device": 24572, "External id": 156, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::conj", "pid": 24572, "tid": "24610", + "ts": 1621401187234874, "dur": 4, + "args": { + "Device": 24572, "External id": 159, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187234918, "dur": 47, + "args": { + "Device": 24572, "External id": 161, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 24572, "tid": "24610", + "ts": 1621401187234890, "dur": 149, + "args": { + "Device": 24572, "External id": 160, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187235092, "dur": 15, + "args": { + "Device": 24572, "External id": 164, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + 
"ts": 1621401187235080, "dur": 39, + "args": { + "Device": 24572, "External id": 163, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187235067, "dur": 75, + "args": { + "Device": 24572, "External id": 162, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "MmBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187234734, "dur": 424, + "args": { + "Device": 24572, "External id": 155, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 23 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187235312, "dur": 13, + "args": { + "Device": 24572, "External id": 168, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187235301, "dur": 40, + "args": { + "Device": 24572, "External id": 167, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187235288, "dur": 78, + "args": { + "Device": 24572, "External id": 166, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "TBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187235271, "dur": 103, + "args": { + "Device": 24572, "External id": 165, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24610", + "ts": 1621401187235487, "dur": 85, + "args": { + "Device": 24572, "External id": 170, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "torch::autograd::AccumulateGrad", "pid": 24572, "tid": "24610", + "ts": 1621401187235467, "dur": 147, + "args": { + "Device": 24572, "External id": 169, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187235803, "dur": 24, + "args": { + "Device": 24572, "External id": 172, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187235850, "dur": 5, + "args": { + "Device": 24572, "External id": 173, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zeros", "pid": 24572, "tid": "24572", + "ts": 1621401187235787, "dur": 75, + "args": { + "Device": 24572, "External id": 171, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187235954, "dur": 20, + "args": { + "Device": 24572, "External id": 175, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187236091, "dur": 82, + "args": { + "Device": 24572, "External id": 176, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": 
"24572", + "ts": 1621401187236221, "dur": 70, + "args": { + "Device": 24572, "External id": 177, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187236334, "dur": 68, + "args": { + "Device": 24572, "External id": 178, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187236444, "dur": 68, + "args": { + "Device": 24572, "External id": 179, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "Optimizer.step#SGD.step", "pid": 24572, "tid": "24572", + "ts": 1621401187235935, "dur": 663, + "args": { + "Device": 24572, "External id": 174, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#2", "pid": 24572, "tid": "24572", + "ts": 1621401187223358, "dur": 13410, + "args": { + "Device": 24572, "External id": 4, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Memcpy", + "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": "stream 7", + "ts": 1621401187224556, "dur": 1, + "args": { + "device": 0, "context": 1, + "stream": 7, "correlation": 311, "external id": 31, + "bytes": 640, "memory bandwidth (GB/s)": 0.46511627906976744 + } + }, + { + "ph": "f", "id": 311, "pid": 0, "tid": "stream 7", "ts": 1621401187224556, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemcpyAsync", "pid": 24572, "tid": "24572", + "ts": 1621401187224533, "dur": 20, + "args": { + "cbid": 41, "correlation": 311, + "external id": 31, "external ts": 1621401187224496 + } + }, + { + "ph": "s", "id": 311, "pid": 24572, "tid": 24572, "ts": 1621401187224533, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaStreamSynchronize", "pid": 24572, "tid": "24572", + "ts": 1621401187224554, "dur": 8, + "args": { + "cbid": 131, "correlation": 312, + "external id": 31, "external ts": 1621401187224496 + } + }, + { + "ph": "X", "cat": "Memcpy", + "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": "stream 7", + "ts": 1621401187224767, "dur": 1, + "args": { + "device": 0, "context": 1, + "stream": 7, "correlation": 323, "external id": 34, + "bytes": 128, "memory bandwidth (GB/s)": 0.09523809523809523 + } + }, + { + "ph": "f", "id": 323, "pid": 0, "tid": "stream 7", "ts": 1621401187224767, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemcpyAsync", "pid": 24572, "tid": "24572", + "ts": 1621401187224752, "dur": 12, + "args": { + "cbid": 41, "correlation": 323, + "external id": 34, "external ts": 1621401187224720 + } + }, + { + "ph": "s", "id": 323, "pid": 24572, "tid": 24572, "ts": 1621401187224752, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaStreamSynchronize", "pid": 24572, "tid": "24572", + "ts": 1621401187224765, "dur": 7, + "args": { + "cbid": 131, "correlation": 324, + "external id": 34, "external ts": 1621401187224720 + } + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24572", + "ts": 1621401187225253, "dur": 2, + "args": { + "cbid": 251, "correlation": 332, + "external id": 41, "external ts": 1621401187225112 + } + }, + { + "ph": 
"X", "cat": "Kernel", + "name": "void gemmSN_TN_kernel_64addr, cublasGemvTensorStridedBatched >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched, float>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187225275, "dur": 3, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 333, "external id": 41, + "registers per thread": 72, + "shared memory": 13824, + "blocks per SM": 0.025, + "warps per SM": 0.1, + "grid": [1, 2, 1], + "block": [128, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 333, "pid": 0, "tid": "stream 7", "ts": 1621401187225275, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187225258, "dur": 16, + "args": { + "cbid": 211, "correlation": 333, + "external id": 41, "external ts": 1621401187225112 + } + }, + { + "ph": "s", "id": 333, "pid": 24572, "tid": 24572, "ts": 1621401187225258, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::AddFunctor, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", "pid": 0, "tid": "stream 7", + "ts": 1621401187225530, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 338, "external id": 45, + "registers per thread": 22, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 338, "pid": 0, "tid": "stream 7", "ts": 1621401187225530, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187225512, "dur": 16, + "args": { + "cbid": 211, "correlation": 338, + "external id": 45, "external ts": 1621401187225449 + } + }, + { + "ph": "s", "id": 338, "pid": 24572, "tid": 24572, "ts": 1621401187225512, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187225820, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 352, "external id": 49, + "registers per thread": 18, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 352, "pid": 0, "tid": "stream 7", "ts": 1621401187225820, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187225803, "dur": 15, + "args": { + "cbid": 211, "correlation": 352, + "external id": 49, "external ts": 
1621401187225721 + } + }, + { + "ph": "s", "id": 352, "pid": 24572, "tid": 24572, "ts": 1621401187225803, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24572", + "ts": 1621401187226305, "dur": 2, + "args": { + "cbid": 251, "correlation": 363, + "external id": 57, "external ts": 1621401187226161 + } + }, + { + "ph": "X", "cat": "Kernel", + "name": "void gemmSN_TN_kernel_64addr, cublasGemvTensorStridedBatched >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched, float>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187226325, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 364, "external id": 57, + "registers per thread": 72, + "shared memory": 13824, + "blocks per SM": 0.025, + "warps per SM": 0.1, + "grid": [1, 2, 1], + "block": [128, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 364, "pid": 0, "tid": "stream 7", "ts": 1621401187226325, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187226309, "dur": 15, + "args": { + "cbid": 211, "correlation": 364, + "external id": 57, "external ts": 1621401187226161 + } + }, + { + "ph": "s", "id": 364, "pid": 24572, "tid": 24572, "ts": 1621401187226309, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::AddFunctor, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", "pid": 0, "tid": "stream 7", + "ts": 1621401187226575, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 369, "external id": 61, + "registers per thread": 22, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 369, "pid": 0, "tid": "stream 7", "ts": 1621401187226575, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187226558, "dur": 15, + "args": { + "cbid": 211, "correlation": 369, + "external id": 61, "external ts": 1621401187226497 + } + }, + { + "ph": "s", "id": 369, "pid": 24572, "tid": 24572, "ts": 1621401187226558, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::mse_kernel_cuda(at::TensorIterator&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array >(int, at::native::mse_kernel_cuda(at::TensorIterator&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187226912, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 377, "external id": 63, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + 
"ph": "f", "id": 377, "pid": 0, "tid": "stream 7", "ts": 1621401187226912, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187226895, "dur": 16, + "args": { + "cbid": 211, "correlation": 377, + "external id": 63, "external ts": 1621401187226753 + } + }, + { + "ph": "s", "id": 377, "pid": 24572, "tid": 24572, "ts": 1621401187226895, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187227092, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 388, "external id": 65, + "registers per thread": 32, + "shared memory": 16, + "blocks per SM": 0.0125, + "warps per SM": 0.0125, + "grid": [1, 1, 1], + "block": [32, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 388, "pid": 0, "tid": "stream 7", "ts": 1621401187227092, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187227075, "dur": 15, + "args": { + "cbid": 211, "correlation": 388, + "external id": 65, "external ts": 1621401187226930 + } + }, + { + "ph": "s", "id": 388, "pid": 24572, "tid": 24572, "ts": 1621401187227075, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187227619, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 395, "external id": 74, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 395, "pid": 0, "tid": "stream 7", "ts": 1621401187227619, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187227601, "dur": 16, + "args": { + "cbid": 211, "correlation": 395, + "external id": 74, "external ts": 1621401187227576 + } + }, + { + "ph": "s", "id": 395, "pid": 24572, "tid": 24572, "ts": 1621401187227601, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187227745, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 402, "external id": 76, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 402, "pid": 0, "tid": "stream 7", "ts": 1621401187227745, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187227729, "dur": 14, + "args": { + "cbid": 211, "correlation": 402, + 
"external id": 76, "external ts": 1621401187227707 + } + }, + { + "ph": "s", "id": 402, "pid": 24572, "tid": 24572, "ts": 1621401187227729, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187227859, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 409, "external id": 78, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 409, "pid": 0, "tid": "stream 7", "ts": 1621401187227859, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187227844, "dur": 13, + "args": { + "cbid": 211, "correlation": 409, + "external id": 78, "external ts": 1621401187227823 + } + }, + { + "ph": "s", "id": 409, "pid": 24572, "tid": 24572, "ts": 1621401187227844, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187227973, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 416, "external id": 80, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 416, "pid": 0, "tid": "stream 7", "ts": 1621401187227973, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187227958, "dur": 13, + "args": { + "cbid": 211, "correlation": 416, + "external id": 80, "external ts": 1621401187227937 + } + }, + { + "ph": "s", "id": 416, "pid": 24572, "tid": 24572, "ts": 1621401187227958, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187228279, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 429, "external id": 84, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 429, "pid": 0, "tid": "stream 7", "ts": 1621401187228279, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187228262, "dur": 15, + "args": { + "cbid": 211, "correlation": 429, + "external id": 84, "external ts": 1621401187228235 + } + }, + { + "ph": "s", "id": 429, "pid": 24572, "tid": 24572, "ts": 1621401187228262, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 
1621401187228962, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 440, "external id": 91, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 440, "pid": 0, "tid": "stream 7", "ts": 1621401187228962, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187228932, "dur": 30, + "args": { + "cbid": 211, "correlation": 440, + "external id": 91, "external ts": 1621401187228885 + } + }, + { + "ph": "s", "id": 440, "pid": 24572, "tid": 24610, "ts": 1621401187228932, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::unrolled_elementwise_kernel, OffsetCalculator<3, unsigned int>, at::detail::Array<1, unsigned int>, at::native::memory::LoadWithoutCast, OffsetCalculator::StoreWithoutCast>(int, at::native::mse_backward_cuda_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float, float)#1}, at::detail::Array, OffsetCalculator<3, unsigned int>, at::detail::Array<1, unsigned int>, at::native::memory::LoadWithoutCast, OffsetCalculator::StoreWithoutCast)", "pid": 0, "tid": "stream 7", + "ts": 1621401187229153, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 446, "external id": 92, + "registers per thread": 28, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 446, "pid": 0, "tid": "stream 7", "ts": 1621401187229153, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187229127, "dur": 26, + "args": { + "cbid": 211, "correlation": 446, + "external id": 92, "external ts": 1621401187229048 + } + }, + { + "ph": "s", "id": 446, "pid": 24572, "tid": 24610, "ts": 1621401187229127, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::reduce_kernel<256, 2, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187229711, "dur": 4, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 460, "external id": 94, + "registers per thread": 35, + "shared memory": 16, + "blocks per SM": 0.0125, + "warps per SM": 0.00625, + "grid": [1, 1, 1], + "block": [1, 16, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 460, "pid": 0, "tid": "stream 7", "ts": 1621401187229711, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187229681, "dur": 30, + "args": { + "cbid": 211, "correlation": 460, + "external id": 94, "external ts": 1621401187229459 + } + }, + { + "ph": "s", "id": 460, "pid": 24572, "tid": 24610, "ts": 1621401187229681, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array 
>(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187230162, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 467, "external id": 98, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 467, "pid": 0, "tid": "stream 7", "ts": 1621401187230162, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187230133, "dur": 29, + "args": { + "cbid": 211, "correlation": 467, + "external id": 98, "external ts": 1621401187230059 + } + }, + { + "ph": "s", "id": 467, "pid": 24572, "tid": 24610, "ts": 1621401187230133, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24610", + "ts": 1621401187231063, "dur": 4, + "args": { + "cbid": 251, "correlation": 480, + "external id": 107, "external ts": 1621401187230889 + } + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24610", + "ts": 1621401187231069, "dur": 1, + "args": { + "cbid": 251, "correlation": 481, + "external id": 107, "external ts": 1621401187230889 + } + }, + { + "ph": "X", "cat": "Kernel", + "name": "volta_sgemm_128x32_nt", "pid": 0, "tid": "stream 7", + "ts": 1621401187231100, "dur": 3, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 482, "external id": 107, + "registers per thread": 55, + "shared memory": 16384, + "blocks per SM": 0.0125, + "warps per SM": 0.1, + "grid": [1, 1, 1], + "block": [256, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 482, "pid": 0, "tid": "stream 7", "ts": 1621401187231100, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187231073, "dur": 27, + "args": { + "cbid": 211, "correlation": 482, + "external id": 107, "external ts": 1621401187230889 + } + }, + { + "ph": "s", "id": 482, "pid": 24572, "tid": 24610, "ts": 1621401187231073, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24610", + "ts": 1621401187231658, "dur": 3, + "args": { + "cbid": 251, "correlation": 491, + "external id": 116, "external ts": 1621401187231491 + } + }, + { + "ph": "X", "cat": "Kernel", + "name": "void gemmSN_NN_kernel, cublasGemvTensorStridedBatched >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched, float>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187231692, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 492, "external id": 116, + "registers per thread": 64, + "shared memory": 12288, + "blocks per SM": 0.05, + "warps per SM": 0.4, + "grid": [1, 4, 1], + "block": [256, 1, 1], + "theoretical occupancy %": 1 + } + }, + { + "ph": "f", "id": 492, "pid": 0, "tid": "stream 7", "ts": 1621401187231692, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187231665, "dur": 27, + "args": { + "cbid": 211, "correlation": 492, + "external id": 116, "external ts": 
1621401187231491 + } + }, + { + "ph": "s", "id": 492, "pid": 24572, "tid": 24610, "ts": 1621401187231665, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187232603, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 503, "external id": 126, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 503, "pid": 0, "tid": "stream 7", "ts": 1621401187232603, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187232583, "dur": 19, + "args": { + "cbid": 211, "correlation": 503, + "external id": 126, "external ts": 1621401187232535 + } + }, + { + "ph": "s", "id": 503, "pid": 24572, "tid": 24610, "ts": 1621401187232583, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187232921, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 513, "external id": 130, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 513, "pid": 0, "tid": "stream 7", "ts": 1621401187232921, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187232901, "dur": 19, + "args": { + "cbid": 211, "correlation": 513, + "external id": 130, "external ts": 1621401187232866 + } + }, + { + "ph": "s", "id": 513, "pid": 24572, "tid": 24610, "ts": 1621401187232901, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187233342, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 526, "external id": 133, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 526, "pid": 0, "tid": "stream 7", "ts": 1621401187233342, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187233323, "dur": 18, + "args": { + "cbid": 211, "correlation": 526, + "external id": 133, "external ts": 1621401187233168 + } + }, + { + "ph": "s", "id": 526, "pid": 24572, "tid": 24610, "ts": 1621401187233323, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::unrolled_elementwise_kernel, OffsetCalculator<3, unsigned int>, at::detail::Array<1, unsigned int>, at::native::memory::LoadWithoutCast, OffsetCalculator::StoreWithoutCast>(int, 
at::native::(anonymous namespace)::where_kernel_impl(at::TensorIterator&, c10::ScalarType)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(bool, float, float)#1}, at::detail::Array, OffsetCalculator<3, unsigned int>, at::detail::Array<1, unsigned int>, at::native::memory::LoadWithoutCast, OffsetCalculator::StoreWithoutCast)", "pid": 0, "tid": "stream 7", + "ts": 1621401187233770, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 535, "external id": 144, + "registers per thread": 26, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 535, "pid": 0, "tid": "stream 7", "ts": 1621401187233770, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187233751, "dur": 19, + "args": { + "cbid": 211, "correlation": 535, + "external id": 144, "external ts": 1621401187233620 + } + }, + { + "ph": "s", "id": 535, "pid": 24572, "tid": 24610, "ts": 1621401187233751, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187234156, "dur": 3, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 548, "external id": 147, + "registers per thread": 32, + "shared memory": 16, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [4, 16, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 548, "pid": 0, "tid": "stream 7", "ts": 1621401187234156, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187234135, "dur": 19, + "args": { + "cbid": 211, "correlation": 548, + "external id": 147, "external ts": 1621401187233990 + } + }, + { + "ph": "s", "id": 548, "pid": 24572, "tid": 24610, "ts": 1621401187234135, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187234445, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 555, "external id": 151, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 555, "pid": 0, "tid": "stream 7", "ts": 1621401187234445, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187234425, "dur": 19, + "args": { + "cbid": 211, "correlation": 555, + "external id": 151, "external ts": 1621401187234378 + } + }, + { + "ph": "s", "id": 555, "pid": 24572, "tid": 24610, "ts": 1621401187234425, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24610", + 
"ts": 1621401187235000, "dur": 2, + "args": { + "cbid": 251, "correlation": 568, + "external id": 160, "external ts": 1621401187234890 + } + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24610", + "ts": 1621401187235004, "dur": 0, + "args": { + "cbid": 251, "correlation": 569, + "external id": 160, "external ts": 1621401187234890 + } + }, + { + "ph": "X", "cat": "Kernel", + "name": "volta_sgemm_128x32_nt", "pid": 0, "tid": "stream 7", + "ts": 1621401187235025, "dur": 3, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 570, "external id": 160, + "registers per thread": 55, + "shared memory": 16384, + "blocks per SM": 0.0125, + "warps per SM": 0.1, + "grid": [1, 1, 1], + "block": [256, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 570, "pid": 0, "tid": "stream 7", "ts": 1621401187235025, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187235006, "dur": 17, + "args": { + "cbid": 211, "correlation": 570, + "external id": 160, "external ts": 1621401187234890 + } + }, + { + "ph": "s", "id": 570, "pid": 24572, "tid": 24610, "ts": 1621401187235006, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187235555, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 579, "external id": 170, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 579, "pid": 0, "tid": "stream 7", "ts": 1621401187235555, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187235535, "dur": 19, + "args": { + "cbid": 211, "correlation": 579, + "external id": 170, "external ts": 1621401187235487 + } + }, + { + "ph": "s", "id": 579, "pid": 24572, "tid": 24610, "ts": 1621401187235535, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187236158, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 585, "external id": 176, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 585, "pid": 0, "tid": "stream 7", "ts": 1621401187236158, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187236138, "dur": 18, + "args": { + "cbid": 211, "correlation": 585, + "external id": 176, "external ts": 1621401187236091 + } + }, + { + "ph": "s", "id": 585, "pid": 24572, "tid": 24572, "ts": 1621401187236138, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, 
at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187236278, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 590, "external id": 177, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 590, "pid": 0, "tid": "stream 7", "ts": 1621401187236278, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187236261, "dur": 15, + "args": { + "cbid": 211, "correlation": 590, + "external id": 177, "external ts": 1621401187236221 + } + }, + { + "ph": "s", "id": 590, "pid": 24572, "tid": 24572, "ts": 1621401187236261, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187236390, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 595, "external id": 178, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 595, "pid": 0, "tid": "stream 7", "ts": 1621401187236390, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187236373, "dur": 15, + "args": { + "cbid": 211, "correlation": 595, + "external id": 178, "external ts": 1621401187236334 + } + }, + { + "ph": "s", "id": 595, "pid": 24572, "tid": 24572, "ts": 1621401187236373, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187236501, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 600, "external id": 179, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 600, "pid": 0, "tid": "stream 7", "ts": 1621401187236501, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187236483, "dur": 15, + "args": { + "cbid": 211, "correlation": 600, + "external id": 179, "external ts": 1621401187236444 + } + }, + { + "ph": "s", "id": 600, "pid": 24572, "tid": 24572, "ts": 1621401187236483, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaDeviceSynchronize", "pid": 24572, "tid": "24572", + "ts": 1621401187236853, "dur": 10, + "args": { + "cbid": 165, "correlation": 605, + "external id": 0, "external ts": 0 + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 24572, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 24572, "tid": 0, + "args": { + "labels": "CPU" + } + }, + { + "name": 
"process_name", "ph": "M", "ts": 1621401187223005, "pid": 0, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 0, "tid": 0, + "args": { + "labels": "GPU 0" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 1, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 1, "tid": 0, + "args": { + "labels": "GPU 1" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 2, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 2, "tid": 0, + "args": { + "labels": "GPU 2" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 3, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 3, "tid": 0, + "args": { + "labels": "GPU 3" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 4, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 4, "tid": 0, + "args": { + "labels": "GPU 4" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 5, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 5, "tid": 0, + "args": { + "labels": "GPU 5" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 6, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 6, "tid": 0, + "args": { + "labels": "GPU 6" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 7, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 7, "tid": 0, + "args": { + "labels": "GPU 7" + } + }, + { + "name": "thread_name", "ph": "M", "ts": 1621401187223005, "pid": 24572, "tid": "24610", + "args": { + "name": "thread 24610 (python)" + } + }, + { + "name": "thread_name", "ph": "M", "ts": 1621401187223005, "pid": 24572, "tid": "24572", + "args": { + "name": "thread 24572 (python)" + } + }, + { + "ph": "X", "cat": "Trace", "ts": 1621401187223005, "dur": 13896, + "pid": "Traces", "tid": "PyTorch Profiler", + "name": "PyTorch Profiler (0)", + "args": { + "Op count": 0 + } + }, + { + "name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g", + "pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 1621401187223005 + }, + { + "name": "Record Window End", "ph": "i", "s": "g", + "pid": "", "tid": "", "ts": 1621401187237108 + } +, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187223005, "args":{"GPU Utilization":1}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187223005, "args":{"GPU Utilization":0}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187223005, "args":{"GPU Utilization":0.0}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187224005, "args":{"GPU Utilization":0.0}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187225005, "args":{"GPU Utilization":0.6}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187226005, "args":{"GPU Utilization":0.5}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187227005, "args":{"GPU Utilization":0.6}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187228005, 
"args":{"GPU Utilization":0.2}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187229005, "args":{"GPU Utilization":0.6}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187230005, "args":{"GPU Utilization":0.1}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187231005, "args":{"GPU Utilization":0.5}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187232005, "args":{"GPU Utilization":0.2}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187233005, "args":{"GPU Utilization":0.3}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187234005, "args":{"GPU Utilization":0.4}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187235005, "args":{"GPU Utilization":0.4219409282700422}}, {"ph":"C", "name":"GPU 0 Utilization", "pid":0, "ts":1621401187236901, "args":{"GPU Utilization":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187223005, "args":{"Est. SM Efficiency":1}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187223005, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187225275, "args":{"Est. SM Efficiency":0.25}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187225278, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187225530, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187225532, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187225820, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187225821, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187226325, "args":{"Est. SM Efficiency":0.25}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187226327, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187226575, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187226577, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187226912, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187226913, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187227092, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187227094, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187227619, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187227620, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187227745, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187227746, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187227859, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187227860, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187227973, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. 
SM Efficiency", "pid":0, "ts":1621401187227974, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187228279, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187228280, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187228962, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187228963, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187229153, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187229155, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187229711, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187229715, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187230162, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187230163, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187231100, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187231103, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187231692, "args":{"Est. SM Efficiency":0.5}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187231694, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187232603, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187232604, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187232921, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187232922, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187233342, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187233343, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187233770, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187233772, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187234156, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187234159, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187234445, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187234446, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187235025, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187235028, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187235555, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187235556, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. 
SM Efficiency", "pid":0, "ts":1621401187236158, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187236159, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187236278, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187236279, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187236390, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187236391, "args":{"Est. SM Efficiency":0}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187236501, "args":{"Est. SM Efficiency":0.125}}, {"ph":"C", "name":"GPU 0 Est. SM Efficiency", "pid":0, "ts":1621401187236502, "args":{"Est. SM Efficiency":0}}]} \ No newline at end of file diff --git a/tb_plugins/profiling/tb_plugin/test/gpu_metrics_input.json b/tb_plugins/profiling/tb_plugin/test/gpu_metrics_input.json new file mode 100644 index 0000000000000000000000000000000000000000..71530b1d6e5602c3ef1decf2bee33c0a3f98cc1c --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/test/gpu_metrics_input.json @@ -0,0 +1,3105 @@ + +{ + "schemaVersion": 1, + + "computeProperties": [ + + { + "id": 0, "name": "Tesla V100-DGXS-32GB", "totalGlobalMem": 34084028416, + "major": 7, "minor": 0, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiProcessor": 2048, + "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, + "multiProcessorCount": 80, "sharedMemPerBlockOptin": 98304 + }, + + { + "id": 1, "name": "Tesla V100-DGXS-32GB", "totalGlobalMem": 34087305216, + "major": 7, "minor": 0, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiProcessor": 2048, + "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, + "multiProcessorCount": 80, "sharedMemPerBlockOptin": 98304 + }, + + { + "id": 2, "name": "Tesla V100-DGXS-32GB", "totalGlobalMem": 34087305216, + "major": 7, "minor": 0, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiProcessor": 2048, + "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, + "multiProcessorCount": 80, "sharedMemPerBlockOptin": 98304 + }, + + { + "id": 3, "name": "Tesla V100-DGXS-32GB", "totalGlobalMem": 34087305216, + "major": 7, "minor": 0, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiProcessor": 2048, + "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 98304, + "multiProcessorCount": 80, "sharedMemPerBlockOptin": 98304 + } + ], + "traceEvents": [ + + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187223197, "dur": 21, + "args": { + "Device": 24572, "External id": 2, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187223264, "dur": 5, + "args": { + "Device": 24572, "External id": 3, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zeros", "pid": 24572, "tid": "24572", + "ts": 1621401187223182, "dur": 99, + "args": { + "Device": 24572, "External id": 1, + "Trace name": "PyTorch Profiler", 
"Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187223376, "dur": 19, + "args": { + "Device": 24572, "External id": 5, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187223480, "dur": 18, + "args": { + "Device": 24572, "External id": 7, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187223530, "dur": 5, + "args": { + "Device": 24572, "External id": 8, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zeros", "pid": 24572, "tid": "24572", + "ts": 1621401187223469, "dur": 72, + "args": { + "Device": 24572, "External id": 6, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187223622, "dur": 19, + "args": { + "Device": 24572, "External id": 10, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187223790, "dur": 12, + "args": { + "Device": 24572, "External id": 13, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::unsqueeze", "pid": 24572, "tid": "24572", + "ts": 1621401187223777, "dur": 50, + "args": { + "Device": 24572, "External id": 12, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187223850, "dur": 7, + "args": { + "Device": 24572, "External id": 15, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::unsqueeze", "pid": 24572, "tid": "24572", + "ts": 1621401187223841, "dur": 24, + "args": { + "Device": 24572, "External id": 14, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187223904, "dur": 16, + "args": { + "Device": 24572, "External id": 18, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::resize_", "pid": 24572, "tid": "24572", + "ts": 1621401187223945, "dur": 14, + "args": { + "Device": 24572, "External id": 19, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_cat", "pid": 24572, "tid": "24572", + "ts": 1621401187223888, "dur": 87, + "args": { + "Device": 24572, "External id": 17, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::cat", "pid": 24572, "tid": "24572", + "ts": 1621401187223876, "dur": 106, + "args": { + "Device": 24572, "External id": 16, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::stack", "pid": 24572, "tid": "24572", + "ts": 1621401187223752, "dur": 245, + "args": { + "Device": 24572, "External id": 11, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", 
"cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187224094, "dur": 12, + "args": { + "Device": 24572, "External id": 22, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::unsqueeze", "pid": 24572, "tid": "24572", + "ts": 1621401187224074, "dur": 43, + "args": { + "Device": 24572, "External id": 21, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187224137, "dur": 6, + "args": { + "Device": 24572, "External id": 24, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::unsqueeze", "pid": 24572, "tid": "24572", + "ts": 1621401187224128, "dur": 21, + "args": { + "Device": 24572, "External id": 23, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187224184, "dur": 15, + "args": { + "Device": 24572, "External id": 27, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::resize_", "pid": 24572, "tid": "24572", + "ts": 1621401187224223, "dur": 12, + "args": { + "Device": 24572, "External id": 28, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_cat", "pid": 24572, "tid": "24572", + "ts": 1621401187224169, "dur": 79, + "args": { + "Device": 24572, "External id": 26, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::cat", "pid": 24572, "tid": "24572", + "ts": 1621401187224159, "dur": 96, + "args": { + "Device": 24572, "External id": 25, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::stack", "pid": 24572, "tid": "24572", + "ts": 1621401187224056, "dur": 213, + "args": { + "Device": 24572, "External id": 20, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__", "pid": 24572, "tid": "24572", + "ts": 1621401187223604, "dur": 725, + "args": { + "Device": 24572, "External id": 9, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187224415, "dur": 54, + "args": { + "Device": 24572, "External id": 30, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::copy_", "pid": 24572, "tid": "24572", + "ts": 1621401187224496, "dur": 80, + "args": { + "Device": 24572, "External id": 31, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 24572, "tid": "24572", + "ts": 1621401187224398, "dur": 193, + "args": { + "Device": 24572, "External id": 29, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187224645, "dur": 51, + "args": { + "Device": 24572, 
"External id": 33, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::copy_", "pid": 24572, "tid": "24572", + "ts": 1621401187224720, "dur": 65, + "args": { + "Device": 24572, "External id": 34, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 24572, "tid": "24572", + "ts": 1621401187224631, "dur": 168, + "args": { + "Device": 24572, "External id": 32, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187224956, "dur": 14, + "args": { + "Device": 24572, "External id": 38, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24572", + "ts": 1621401187224945, "dur": 37, + "args": { + "Device": 24572, "External id": 37, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24572", + "ts": 1621401187224917, "dur": 101, + "args": { + "Device": 24572, "External id": 36, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24572", + "ts": 1621401187225058, "dur": 33, + "args": { + "Device": 24572, "External id": 40, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 23 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187225181, "dur": 41, + "args": { + "Device": 24572, "External id": 42, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 24572, "tid": "24572", + "ts": 1621401187225112, "dur": 197, + "args": { + "Device": 24572, "External id": 41, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 23 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24572", + "ts": 1621401187225367, "dur": 17, + "args": { + "Device": 24572, "External id": 44, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_unsafe_view", "pid": 24572, "tid": "24572", + "ts": 1621401187225336, "dur": 79, + "args": { + "Device": 24572, "External id": 43, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 24 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::matmul", "pid": 24572, "tid": "24572", + "ts": 1621401187225037, "dur": 394, + "args": { + "Device": 24572, "External id": 39, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 23 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187225449, "dur": 107, + "args": { + "Device": 24572, "External id": 45, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 25 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::linear", "pid": 24572, "tid": "24572", + "ts": 1621401187224907, "dur": 664, + "args": { + "Device": 
24572, "External id": 35, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187225662, "dur": 25, + "args": { + "Device": 24572, "External id": 47, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::resize_", "pid": 24572, "tid": "24572", + "ts": 1621401187225746, "dur": 30, + "args": { + "Device": 24572, "External id": 50, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::clamp_min", "pid": 24572, "tid": "24572", + "ts": 1621401187225721, "dur": 105, + "args": { + "Device": 24572, "External id": 49, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::clamp", "pid": 24572, "tid": "24572", + "ts": 1621401187225709, "dur": 128, + "args": { + "Device": 24572, "External id": 48, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::clamp", "pid": 24572, "tid": "24572", + "ts": 1621401187225606, "dur": 263, + "args": { + "Device": 24572, "External id": 46, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 26 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187225978, "dur": 14, + "args": { + "Device": 24572, "External id": 54, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24572", + "ts": 1621401187225968, "dur": 36, + "args": { + "Device": 24572, "External id": 53, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24572", + "ts": 1621401187225941, "dur": 98, + "args": { + "Device": 24572, "External id": 52, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 27 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24572", + "ts": 1621401187226077, "dur": 60, + "args": { + "Device": 24572, "External id": 56, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 28 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187226233, "dur": 41, + "args": { + "Device": 24572, "External id": 58, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 24572, "tid": "24572", + "ts": 1621401187226161, "dur": 197, + "args": { + "Device": 24572, "External id": 57, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 29 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24572", + "ts": 1621401187226416, "dur": 17, + "args": { + "Device": 24572, "External id": 60, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_unsafe_view", "pid": 24572, "tid": "24572", + "ts": 1621401187226384, "dur": 79, + "args": { + "Device": 24572, "External id": 59, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence 
number": 30 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::matmul", "pid": 24572, "tid": "24572", + "ts": 1621401187226057, "dur": 422, + "args": { + "Device": 24572, "External id": 55, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 28 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187226497, "dur": 103, + "args": { + "Device": 24572, "External id": 61, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 31 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::linear", "pid": 24572, "tid": "24572", + "ts": 1621401187225932, "dur": 683, + "args": { + "Device": 24572, "External id": 51, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 27 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::broadcast_tensors", "pid": 24572, "tid": "24572", + "ts": 1621401187226708, "dur": 11, + "args": { + "Device": 24572, "External id": 62, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 32 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187226827, "dur": 41, + "args": { + "Device": 24572, "External id": 64, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187226955, "dur": 35, + "args": { + "Device": 24572, "External id": 66, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187227020, "dur": 11, + "args": { + "Device": 24572, "External id": 67, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::sum", "pid": 24572, "tid": "24572", + "ts": 1621401187226930, "dur": 176, + "args": { + "Device": 24572, "External id": 65, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mse_loss", "pid": 24572, "tid": "24572", + "ts": 1621401187226753, "dur": 445, + "args": { + "Device": 24572, "External id": 63, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 32 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187227327, "dur": 21, + "args": { + "Device": 24572, "External id": 69, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187227368, "dur": 5, + "args": { + "Device": 24572, "External id": 70, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zeros", "pid": 24572, "tid": "24572", + "ts": 1621401187227314, "dur": 65, + "args": { + "Device": 24572, "External id": 68, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187227464, "dur": 18, + "args": { + "Device": 24572, "External id": 72, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24572", + 
"ts": 1621401187227576, "dur": 49, + "args": { + "Device": 24572, "External id": 74, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187227553, "dur": 97, + "args": { + "Device": 24572, "External id": 73, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 33 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24572", + "ts": 1621401187227707, "dur": 43, + "args": { + "Device": 24572, "External id": 76, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187227689, "dur": 79, + "args": { + "Device": 24572, "External id": 75, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 33 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24572", + "ts": 1621401187227823, "dur": 42, + "args": { + "Device": 24572, "External id": 78, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187227805, "dur": 77, + "args": { + "Device": 24572, "External id": 77, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 33 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24572", + "ts": 1621401187227937, "dur": 41, + "args": { + "Device": 24572, "External id": 80, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187227919, "dur": 77, + "args": { + "Device": 24572, "External id": 79, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 0, "Sequence number": 33 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "Optimizer.zero_grad#SGD.zero_grad", "pid": 24572, "tid": "24572", + "ts": 1621401187227446, "dur": 606, + "args": { + "Device": 24572, "External id": 71, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty_strided", "pid": 24572, "tid": "24572", + "ts": 1621401187228150, "dur": 53, + "args": { + "Device": 24572, "External id": 83, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty_like", "pid": 24572, "tid": "24572", + "ts": 1621401187228137, "dur": 81, + "args": { + "Device": 24572, "External id": 82, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24572", + "ts": 1621401187228235, "dur": 50, + "args": { + "Device": 24572, "External id": 84, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::ones_like", "pid": 24572, "tid": "24572", + "ts": 1621401187228128, "dur": 169, + "args": { + "Device": 24572, "External id": 81, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187228708, "dur": 79, + "args": { + "Device": 24572, "External id": 89, + "Trace name": "PyTorch Profiler", "Trace iteration": 
0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty_like", "pid": 24572, "tid": "24610", + "ts": 1621401187228680, "dur": 146, + "args": { + "Device": 24572, "External id": 88, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24610", + "ts": 1621401187228885, "dur": 93, + "args": { + "Device": 24572, "External id": 91, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24610", + "ts": 1621401187228858, "dur": 147, + "args": { + "Device": 24572, "External id": 90, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zeros_like", "pid": 24572, "tid": "24610", + "ts": 1621401187228647, "dur": 369, + "args": { + "Device": 24572, "External id": 87, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mse_loss_backward", "pid": 24572, "tid": "24610", + "ts": 1621401187229048, "dur": 122, + "args": { + "Device": 24572, "External id": 92, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mse_loss_backward", "pid": 24572, "tid": "24610", + "ts": 1621401187228603, "dur": 614, + "args": { + "Device": 24572, "External id": 86, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "MseLossBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187228516, "dur": 727, + "args": { + "Device": 24572, "External id": 85, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 32 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "AddBackward1", "pid": 24572, "tid": "24610", + "ts": 1621401187229384, "dur": 17, + "args": { + "Device": 24572, "External id": 93, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 31 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187229506, "dur": 73, + "args": { + "Device": 24572, "External id": 95, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::sum", "pid": 24572, "tid": "24610", + "ts": 1621401187229459, "dur": 279, + "args": { + "Device": 24572, "External id": 94, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24610", + "ts": 1621401187229788, "dur": 65, + "args": { + "Device": 24572, "External id": 96, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24610", + "ts": 1621401187230059, "dur": 131, + "args": { + "Device": 24572, "External id": 98, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "torch::autograd::AccumulateGrad", "pid": 24572, "tid": "24610", + "ts": 1621401187230028, "dur": 228, + "args": { + "Device": 24572, "External id": 97, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24610", + "ts": 1621401187230405, "dur": 61, + "args": { + "Device": 24572, "External id": 101, + "Trace 
name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::reshape", "pid": 24572, "tid": "24610", + "ts": 1621401187230383, "dur": 107, + "args": { + "Device": 24572, "External id": 100, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "UnsafeViewBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187230354, "dur": 146, + "args": { + "Device": 24572, "External id": 99, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 30 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187230751, "dur": 22, + "args": { + "Device": 24572, "External id": 105, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187230732, "dur": 65, + "args": { + "Device": 24572, "External id": 104, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187230710, "dur": 124, + "args": { + "Device": 24572, "External id": 103, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::conj", "pid": 24572, "tid": "24610", + "ts": 1621401187230862, "dur": 7, + "args": { + "Device": 24572, "External id": 106, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187230935, "dur": 73, + "args": { + "Device": 24572, "External id": 108, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 24572, "tid": "24610", + "ts": 1621401187230889, "dur": 235, + "args": { + "Device": 24572, "External id": 107, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187231211, "dur": 23, + "args": { + "Device": 24572, "External id": 111, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187231191, "dur": 69, + "args": { + "Device": 24572, "External id": 110, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187231168, "dur": 129, + "args": { + "Device": 24572, "External id": 109, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187231376, "dur": 17, + "args": { + "Device": 24572, "External id": 114, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187231360, "dur": 49, + "args": { + "Device": 24572, "External id": 113, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187231340, "dur": 100, + "args": { + "Device": 24572, "External id": 112, + "Trace name": "PyTorch Profiler", 
"Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::conj", "pid": 24572, "tid": "24610", + "ts": 1621401187231465, "dur": 6, + "args": { + "Device": 24572, "External id": 115, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187231534, "dur": 72, + "args": { + "Device": 24572, "External id": 117, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 24572, "tid": "24610", + "ts": 1621401187231491, "dur": 225, + "args": { + "Device": 24572, "External id": 116, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "MmBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187230626, "dur": 1124, + "args": { + "Device": 24572, "External id": 102, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 29 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24610", + "ts": 1621401187231992, "dur": 61, + "args": { + "Device": 24572, "External id": 120, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::reshape", "pid": 24572, "tid": "24610", + "ts": 1621401187231970, "dur": 108, + "args": { + "Device": 24572, "External id": 119, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "ViewBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187231941, "dur": 166, + "args": { + "Device": 24572, "External id": 118, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 28 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187232305, "dur": 21, + "args": { + "Device": 24572, "External id": 124, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187232286, "dur": 62, + "args": { + "Device": 24572, "External id": 123, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187232265, "dur": 123, + "args": { + "Device": 24572, "External id": 122, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "TBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187232239, "dur": 161, + "args": { + "Device": 24572, "External id": 121, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 27 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24610", + "ts": 1621401187232535, "dur": 85, + "args": { + "Device": 24572, "External id": 126, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "torch::autograd::AccumulateGrad", "pid": 24572, "tid": "24610", + "ts": 1621401187232515, "dur": 148, + "args": { + "Device": 24572, "External id": 125, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187232790, "dur": 47, + "args": { + "Device": 
24572, "External id": 129, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::fill_", "pid": 24572, "tid": "24610", + "ts": 1621401187232866, "dur": 68, + "args": { + "Device": 24572, "External id": 130, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::scalar_tensor", "pid": 24572, "tid": "24610", + "ts": 1621401187232776, "dur": 174, + "args": { + "Device": 24572, "External id": 128, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187233023, "dur": 27, + "args": { + "Device": 24572, "External id": 132, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_local_scalar_dense", "pid": 24572, "tid": "24610", + "ts": 1621401187233192, "dur": 6, + "args": { + "Device": 24572, "External id": 135, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::item", "pid": 24572, "tid": "24610", + "ts": 1621401187233184, "dur": 24, + "args": { + "Device": 24572, "External id": 134, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::resize_", "pid": 24572, "tid": "24610", + "ts": 1621401187233251, "dur": 41, + "args": { + "Device": 24572, "External id": 136, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::ge", "pid": 24572, "tid": "24610", + "ts": 1621401187233168, "dur": 182, + "args": { + "Device": 24572, "External id": 133, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::ge", "pid": 24572, "tid": "24610", + "ts": 1621401187232971, "dur": 404, + "args": { + "Device": 24572, "External id": 131, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187233430, "dur": 15, + "args": { + "Device": 24572, "External id": 139, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::expand", "pid": 24572, "tid": "24610", + "ts": 1621401187233414, "dur": 62, + "args": { + "Device": 24572, "External id": 138, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187233508, "dur": 10, + "args": { + "Device": 24572, "External id": 141, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::expand", "pid": 24572, "tid": "24610", + "ts": 1621401187233494, "dur": 48, + "args": { + "Device": 24572, "External id": 140, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187233571, "dur": 10, + "args": { + "Device": 24572, "External id": 143, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::expand", "pid": 24572, "tid": "24610", + "ts": 1621401187233558, "dur": 43, + "args": { + "Device": 24572, "External id": 142, + "Trace name": "PyTorch 
Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187233649, "dur": 46, + "args": { + "Device": 24572, "External id": 145, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::_s_where", "pid": 24572, "tid": "24610", + "ts": 1621401187233620, "dur": 167, + "args": { + "Device": 24572, "External id": 144, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::where", "pid": 24572, "tid": "24610", + "ts": 1621401187233398, "dur": 409, + "args": { + "Device": 24572, "External id": 137, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "ClampBackward1", "pid": 24572, "tid": "24610", + "ts": 1621401187232724, "dur": 1110, + "args": { + "Device": 24572, "External id": 127, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 26 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "AddBackward1", "pid": 24572, "tid": "24610", + "ts": 1621401187233941, "dur": 12, + "args": { + "Device": 24572, "External id": 146, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 25 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187234021, "dur": 46, + "args": { + "Device": 24572, "External id": 148, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::sum", "pid": 24572, "tid": "24610", + "ts": 1621401187233990, "dur": 182, + "args": { + "Device": 24572, "External id": 147, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24610", + "ts": 1621401187234208, "dur": 43, + "args": { + "Device": 24572, "External id": 149, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24610", + "ts": 1621401187234378, "dur": 84, + "args": { + "Device": 24572, "External id": 151, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "torch::autograd::AccumulateGrad", "pid": 24572, "tid": "24610", + "ts": 1621401187234357, "dur": 144, + "args": { + "Device": 24572, "External id": 150, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::view", "pid": 24572, "tid": "24610", + "ts": 1621401187234593, "dur": 39, + "args": { + "Device": 24572, "External id": 154, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::reshape", "pid": 24572, "tid": "24610", + "ts": 1621401187234580, "dur": 67, + "args": { + "Device": 24572, "External id": 153, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "UnsafeViewBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187234561, "dur": 92, + "args": { + "Device": 24572, "External id": 152, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 24 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187234803, "dur": 
14, + "args": { + "Device": 24572, "External id": 158, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187234792, "dur": 41, + "args": { + "Device": 24572, "External id": 157, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187234778, "dur": 79, + "args": { + "Device": 24572, "External id": 156, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::conj", "pid": 24572, "tid": "24610", + "ts": 1621401187234874, "dur": 4, + "args": { + "Device": 24572, "External id": 159, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24610", + "ts": 1621401187234918, "dur": 47, + "args": { + "Device": 24572, "External id": 161, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 24572, "tid": "24610", + "ts": 1621401187234890, "dur": 149, + "args": { + "Device": 24572, "External id": 160, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187235092, "dur": 15, + "args": { + "Device": 24572, "External id": 164, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187235080, "dur": 39, + "args": { + "Device": 24572, "External id": 163, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187235067, "dur": 75, + "args": { + "Device": 24572, "External id": 162, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "MmBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187234734, "dur": 424, + "args": { + "Device": 24572, "External id": 155, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 23 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::as_strided", "pid": 24572, "tid": "24610", + "ts": 1621401187235312, "dur": 13, + "args": { + "Device": 24572, "External id": 168, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 24572, "tid": "24610", + "ts": 1621401187235301, "dur": 40, + "args": { + "Device": 24572, "External id": 167, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 24572, "tid": "24610", + "ts": 1621401187235288, "dur": 78, + "args": { + "Device": 24572, "External id": 166, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "TBackward", "pid": 24572, "tid": "24610", + "ts": 1621401187235271, "dur": 103, + "args": { + "Device": 24572, "External id": 165, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 , + "Fwd thread id": 1, "Sequence number": 22 + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24610", + "ts": 1621401187235487, "dur": 85, 
+ "args": { + "Device": 24572, "External id": 170, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "torch::autograd::AccumulateGrad", "pid": 24572, "tid": "24610", + "ts": 1621401187235467, "dur": 147, + "args": { + "Device": 24572, "External id": 169, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187235803, "dur": 24, + "args": { + "Device": 24572, "External id": 172, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zero_", "pid": 24572, "tid": "24572", + "ts": 1621401187235850, "dur": 5, + "args": { + "Device": 24572, "External id": 173, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::zeros", "pid": 24572, "tid": "24572", + "ts": 1621401187235787, "dur": 75, + "args": { + "Device": 24572, "External id": 171, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::empty", "pid": 24572, "tid": "24572", + "ts": 1621401187235954, "dur": 20, + "args": { + "Device": 24572, "External id": 175, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187236091, "dur": 82, + "args": { + "Device": 24572, "External id": 176, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187236221, "dur": 70, + "args": { + "Device": 24572, "External id": 177, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187236334, "dur": 68, + "args": { + "Device": 24572, "External id": 178, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::add_", "pid": 24572, "tid": "24572", + "ts": 1621401187236444, "dur": 68, + "args": { + "Device": 24572, "External id": 179, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "Optimizer.step#SGD.step", "pid": 24572, "tid": "24572", + "ts": 1621401187235935, "dur": 663, + "args": { + "Device": 24572, "External id": 174, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#2", "pid": 24572, "tid": "24572", + "ts": 1621401187223358, "dur": 13410, + "args": { + "Device": 24572, "External id": 4, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + + } + }, + { + "ph": "X", "cat": "Memcpy", + "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": "stream 7", + "ts": 1621401187224556, "dur": 1, + "args": { + "device": 0, "context": 1, + "stream": 7, "correlation": 311, "external id": 31, + "bytes": 640, "memory bandwidth (GB/s)": 0.46511627906976744 + } + }, + { + "ph": "f", "id": 311, "pid": 0, "tid": "stream 7", "ts": 1621401187224556, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemcpyAsync", "pid": 24572, "tid": "24572", + "ts": 1621401187224533, "dur": 20, + "args": { + "cbid": 41, "correlation": 311, + "external id": 31, "external ts": 1621401187224496 + } + }, + 
{ + "ph": "s", "id": 311, "pid": 24572, "tid": 24572, "ts": 1621401187224533, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaStreamSynchronize", "pid": 24572, "tid": "24572", + "ts": 1621401187224554, "dur": 8, + "args": { + "cbid": 131, "correlation": 312, + "external id": 31, "external ts": 1621401187224496 + } + }, + { + "ph": "X", "cat": "Memcpy", + "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": "stream 7", + "ts": 1621401187224767, "dur": 1, + "args": { + "device": 0, "context": 1, + "stream": 7, "correlation": 323, "external id": 34, + "bytes": 128, "memory bandwidth (GB/s)": 0.09523809523809523 + } + }, + { + "ph": "f", "id": 323, "pid": 0, "tid": "stream 7", "ts": 1621401187224767, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemcpyAsync", "pid": 24572, "tid": "24572", + "ts": 1621401187224752, "dur": 12, + "args": { + "cbid": 41, "correlation": 323, + "external id": 34, "external ts": 1621401187224720 + } + }, + { + "ph": "s", "id": 323, "pid": 24572, "tid": 24572, "ts": 1621401187224752, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaStreamSynchronize", "pid": 24572, "tid": "24572", + "ts": 1621401187224765, "dur": 7, + "args": { + "cbid": 131, "correlation": 324, + "external id": 34, "external ts": 1621401187224720 + } + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24572", + "ts": 1621401187225253, "dur": 2, + "args": { + "cbid": 251, "correlation": 332, + "external id": 41, "external ts": 1621401187225112 + } + }, + { + "ph": "X", "cat": "Kernel", + "name": "void gemmSN_TN_kernel_64addr, cublasGemvTensorStridedBatched >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched, float>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187225275, "dur": 3, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 333, "external id": 41, + "registers per thread": 72, + "shared memory": 13824, + "blocks per SM": 0.025, + "warps per SM": 0.1, + "grid": [1, 2, 1], + "block": [128, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 333, "pid": 0, "tid": "stream 7", "ts": 1621401187225275, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187225258, "dur": 16, + "args": { + "cbid": 211, "correlation": 333, + "external id": 41, "external ts": 1621401187225112 + } + }, + { + "ph": "s", "id": 333, "pid": 24572, "tid": 24572, "ts": 1621401187225258, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::AddFunctor, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", "pid": 0, "tid": "stream 7", + "ts": 1621401187225530, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 338, "external id": 45, + "registers per thread": 22, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 
338, "pid": 0, "tid": "stream 7", "ts": 1621401187225530, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187225512, "dur": 16, + "args": { + "cbid": 211, "correlation": 338, + "external id": 45, "external ts": 1621401187225449 + } + }, + { + "ph": "s", "id": 338, "pid": 24572, "tid": 24572, "ts": 1621401187225512, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187225820, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 352, "external id": 49, + "registers per thread": 18, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 352, "pid": 0, "tid": "stream 7", "ts": 1621401187225820, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187225803, "dur": 15, + "args": { + "cbid": 211, "correlation": 352, + "external id": 49, "external ts": 1621401187225721 + } + }, + { + "ph": "s", "id": 352, "pid": 24572, "tid": 24572, "ts": 1621401187225803, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24572", + "ts": 1621401187226305, "dur": 2, + "args": { + "cbid": 251, "correlation": 363, + "external id": 57, "external ts": 1621401187226161 + } + }, + { + "ph": "X", "cat": "Kernel", + "name": "void gemmSN_TN_kernel_64addr, cublasGemvTensorStridedBatched >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched, float>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187226325, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 364, "external id": 57, + "registers per thread": 72, + "shared memory": 13824, + "blocks per SM": 0.025, + "warps per SM": 0.1, + "grid": [1, 2, 1], + "block": [128, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 364, "pid": 0, "tid": "stream 7", "ts": 1621401187226325, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187226309, "dur": 15, + "args": { + "cbid": 211, "correlation": 364, + "external id": 57, "external ts": 1621401187226161 + } + }, + { + "ph": "s", "id": 364, "pid": 24572, "tid": 24572, "ts": 1621401187226309, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::AddFunctor, at::detail::Array, OffsetCalculator<2, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, 
at::native::memory::StoreWithoutCast)", "pid": 0, "tid": "stream 7", + "ts": 1621401187226575, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 369, "external id": 61, + "registers per thread": 22, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 369, "pid": 0, "tid": "stream 7", "ts": 1621401187226575, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187226558, "dur": 15, + "args": { + "cbid": 211, "correlation": 369, + "external id": 61, "external ts": 1621401187226497 + } + }, + { + "ph": "s", "id": 369, "pid": 24572, "tid": 24572, "ts": 1621401187226558, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::mse_kernel_cuda(at::TensorIterator&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array >(int, at::native::mse_kernel_cuda(at::TensorIterator&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187226912, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 377, "external id": 63, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 377, "pid": 0, "tid": "stream 7", "ts": 1621401187226912, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187226895, "dur": 16, + "args": { + "cbid": 211, "correlation": 377, + "external id": 63, "external ts": 1621401187226753 + } + }, + { + "ph": "s", "id": 377, "pid": 24572, "tid": 24572, "ts": 1621401187226895, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187227092, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 388, "external id": 65, + "registers per thread": 32, + "shared memory": 16, + "blocks per SM": 0.0125, + "warps per SM": 0.0125, + "grid": [1, 1, 1], + "block": [32, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 388, "pid": 0, "tid": "stream 7", "ts": 1621401187227092, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187227075, "dur": 15, + "args": { + "cbid": 211, "correlation": 388, + "external id": 65, "external ts": 1621401187226930 + } + }, + { + "ph": "s", "id": 388, "pid": 24572, "tid": 24572, "ts": 1621401187227075, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", 
+ "ts": 1621401187227619, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 395, "external id": 74, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 395, "pid": 0, "tid": "stream 7", "ts": 1621401187227619, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187227601, "dur": 16, + "args": { + "cbid": 211, "correlation": 395, + "external id": 74, "external ts": 1621401187227576 + } + }, + { + "ph": "s", "id": 395, "pid": 24572, "tid": 24572, "ts": 1621401187227601, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187227745, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 402, "external id": 76, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 402, "pid": 0, "tid": "stream 7", "ts": 1621401187227745, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187227729, "dur": 14, + "args": { + "cbid": 211, "correlation": 402, + "external id": 76, "external ts": 1621401187227707 + } + }, + { + "ph": "s", "id": 402, "pid": 24572, "tid": 24572, "ts": 1621401187227729, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187227859, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 409, "external id": 78, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 409, "pid": 0, "tid": "stream 7", "ts": 1621401187227859, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187227844, "dur": 13, + "args": { + "cbid": 211, "correlation": 409, + "external id": 78, "external ts": 1621401187227823 + } + }, + { + "ph": "s", "id": 409, "pid": 24572, "tid": 24572, "ts": 1621401187227844, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187227973, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 416, "external id": 80, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 416, "pid": 0, "tid": "stream 7", "ts": 1621401187227973, + 
"cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187227958, "dur": 13, + "args": { + "cbid": 211, "correlation": 416, + "external id": 80, "external ts": 1621401187227937 + } + }, + { + "ph": "s", "id": 416, "pid": 24572, "tid": 24572, "ts": 1621401187227958, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187228279, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 429, "external id": 84, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 429, "pid": 0, "tid": "stream 7", "ts": 1621401187228279, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187228262, "dur": 15, + "args": { + "cbid": 211, "correlation": 429, + "external id": 84, "external ts": 1621401187228235 + } + }, + { + "ph": "s", "id": 429, "pid": 24572, "tid": 24572, "ts": 1621401187228262, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187228962, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 440, "external id": 91, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 440, "pid": 0, "tid": "stream 7", "ts": 1621401187228962, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187228932, "dur": 30, + "args": { + "cbid": 211, "correlation": 440, + "external id": 91, "external ts": 1621401187228885 + } + }, + { + "ph": "s", "id": 440, "pid": 24572, "tid": 24610, "ts": 1621401187228932, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::unrolled_elementwise_kernel, OffsetCalculator<3, unsigned int>, at::detail::Array<1, unsigned int>, at::native::memory::LoadWithoutCast, OffsetCalculator::StoreWithoutCast>(int, at::native::mse_backward_cuda_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float, float)#1}, at::detail::Array, OffsetCalculator<3, unsigned int>, at::detail::Array<1, unsigned int>, at::native::memory::LoadWithoutCast, OffsetCalculator::StoreWithoutCast)", "pid": 0, "tid": "stream 7", + "ts": 1621401187229153, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 446, "external id": 92, + "registers per thread": 28, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 446, "pid": 0, "tid": "stream 7", "ts": 1621401187229153, + "cat": "async", "name": 
"launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187229127, "dur": 26, + "args": { + "cbid": 211, "correlation": 446, + "external id": 92, "external ts": 1621401187229048 + } + }, + { + "ph": "s", "id": 446, "pid": 24572, "tid": 24610, "ts": 1621401187229127, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::reduce_kernel<256, 2, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187229711, "dur": 4, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 460, "external id": 94, + "registers per thread": 35, + "shared memory": 16, + "blocks per SM": 0.0125, + "warps per SM": 0.00625, + "grid": [1, 1, 1], + "block": [1, 16, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 460, "pid": 0, "tid": "stream 7", "ts": 1621401187229711, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187229681, "dur": 30, + "args": { + "cbid": 211, "correlation": 460, + "external id": 94, "external ts": 1621401187229459 + } + }, + { + "ph": "s", "id": 460, "pid": 24572, "tid": 24610, "ts": 1621401187229681, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187230162, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 467, "external id": 98, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 467, "pid": 0, "tid": "stream 7", "ts": 1621401187230162, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187230133, "dur": 29, + "args": { + "cbid": 211, "correlation": 467, + "external id": 98, "external ts": 1621401187230059 + } + }, + { + "ph": "s", "id": 467, "pid": 24572, "tid": 24610, "ts": 1621401187230133, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24610", + "ts": 1621401187231063, "dur": 4, + "args": { + "cbid": 251, "correlation": 480, + "external id": 107, "external ts": 1621401187230889 + } + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24610", + "ts": 1621401187231069, "dur": 1, + "args": { + "cbid": 251, "correlation": 481, + "external id": 107, "external ts": 1621401187230889 + } + }, + { + "ph": "X", "cat": "Kernel", + "name": "volta_sgemm_128x32_nt", "pid": 0, "tid": "stream 7", + "ts": 1621401187231100, "dur": 3, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 482, "external id": 107, + "registers per thread": 55, + "shared memory": 16384, + "blocks per SM": 0.0125, + "warps per SM": 0.1, + "grid": [1, 1, 1], + "block": [256, 1, 1], + "theoretical 
occupancy %": 0 + } + }, + { + "ph": "f", "id": 482, "pid": 0, "tid": "stream 7", "ts": 1621401187231100, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187231073, "dur": 27, + "args": { + "cbid": 211, "correlation": 482, + "external id": 107, "external ts": 1621401187230889 + } + }, + { + "ph": "s", "id": 482, "pid": 24572, "tid": 24610, "ts": 1621401187231073, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24610", + "ts": 1621401187231658, "dur": 3, + "args": { + "cbid": 251, "correlation": 491, + "external id": 116, "external ts": 1621401187231491 + } + }, + { + "ph": "X", "cat": "Kernel", + "name": "void gemmSN_NN_kernel, cublasGemvTensorStridedBatched >(cublasGemmSmallNParams, cublasGemvTensorStridedBatched, float>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187231692, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 492, "external id": 116, + "registers per thread": 64, + "shared memory": 12288, + "blocks per SM": 0.05, + "warps per SM": 0.4, + "grid": [1, 4, 1], + "block": [256, 1, 1], + "theoretical occupancy %": 1 + } + }, + { + "ph": "f", "id": 492, "pid": 0, "tid": "stream 7", "ts": 1621401187231692, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187231665, "dur": 27, + "args": { + "cbid": 211, "correlation": 492, + "external id": 116, "external ts": 1621401187231491 + } + }, + { + "ph": "s", "id": 492, "pid": 24572, "tid": 24610, "ts": 1621401187231665, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187232603, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 503, "external id": 126, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 503, "pid": 0, "tid": "stream 7", "ts": 1621401187232603, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187232583, "dur": 19, + "args": { + "cbid": 211, "correlation": 503, + "external id": 126, "external ts": 1621401187232535 + } + }, + { + "ph": "s", "id": 503, "pid": 24572, "tid": 24610, "ts": 1621401187232583, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187232921, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 513, "external id": 130, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 513, "pid": 0, "tid": "stream 7", "ts": 1621401187232921, + "cat": "async", "name": "launch", "bp": "e" + }, + 
{ + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187232901, "dur": 19, + "args": { + "cbid": 211, "correlation": 513, + "external id": 130, "external ts": 1621401187232866 + } + }, + { + "ph": "s", "id": 513, "pid": 24572, "tid": 24610, "ts": 1621401187232901, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187233342, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 526, "external id": 133, + "registers per thread": 16, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 526, "pid": 0, "tid": "stream 7", "ts": 1621401187233342, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187233323, "dur": 18, + "args": { + "cbid": 211, "correlation": 526, + "external id": 133, "external ts": 1621401187233168 + } + }, + { + "ph": "s", "id": 526, "pid": 24572, "tid": 24610, "ts": 1621401187233323, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::unrolled_elementwise_kernel, OffsetCalculator<3, unsigned int>, at::detail::Array<1, unsigned int>, at::native::memory::LoadWithoutCast, OffsetCalculator::StoreWithoutCast>(int, at::native::(anonymous namespace)::where_kernel_impl(at::TensorIterator&, c10::ScalarType)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(bool, float, float)#1}, at::detail::Array, OffsetCalculator<3, unsigned int>, at::detail::Array<1, unsigned int>, at::native::memory::LoadWithoutCast, OffsetCalculator::StoreWithoutCast)", "pid": 0, "tid": "stream 7", + "ts": 1621401187233770, "dur": 2, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 535, "external id": 144, + "registers per thread": 26, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 535, "pid": 0, "tid": "stream 7", "ts": 1621401187233770, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187233751, "dur": 19, + "args": { + "cbid": 211, "correlation": 535, + "external id": 144, "external ts": 1621401187233620 + } + }, + { + "ph": "s", "id": 535, "pid": 24572, "tid": 24610, "ts": 1621401187233751, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": "stream 7", + "ts": 1621401187234156, "dur": 3, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 548, "external id": 147, + "registers per thread": 32, + "shared memory": 16, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [4, 16, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 
548, "pid": 0, "tid": "stream 7", "ts": 1621401187234156, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187234135, "dur": 19, + "args": { + "cbid": 211, "correlation": 548, + "external id": 147, "external ts": 1621401187233990 + } + }, + { + "ph": "s", "id": 548, "pid": 24572, "tid": 24610, "ts": 1621401187234135, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187234445, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 555, "external id": 151, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 555, "pid": 0, "tid": "stream 7", "ts": 1621401187234445, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187234425, "dur": 19, + "args": { + "cbid": 211, "correlation": 555, + "external id": 151, "external ts": 1621401187234378 + } + }, + { + "ph": "s", "id": 555, "pid": 24572, "tid": 24610, "ts": 1621401187234425, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24610", + "ts": 1621401187235000, "dur": 2, + "args": { + "cbid": 251, "correlation": 568, + "external id": 160, "external ts": 1621401187234890 + } + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "pid": 24572, "tid": "24610", + "ts": 1621401187235004, "dur": 0, + "args": { + "cbid": 251, "correlation": 569, + "external id": 160, "external ts": 1621401187234890 + } + }, + { + "ph": "X", "cat": "Kernel", + "name": "volta_sgemm_128x32_nt", "pid": 0, "tid": "stream 7", + "ts": 1621401187235025, "dur": 3, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 570, "external id": 160, + "registers per thread": 55, + "shared memory": 16384, + "blocks per SM": 0.0125, + "warps per SM": 0.1, + "grid": [1, 1, 1], + "block": [256, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 570, "pid": 0, "tid": "stream 7", "ts": 1621401187235025, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187235006, "dur": 17, + "args": { + "cbid": 211, "correlation": 570, + "external id": 160, "external ts": 1621401187234890 + } + }, + { + "ph": "s", "id": 570, "pid": 24572, "tid": 24610, "ts": 1621401187235006, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187235555, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 579, "external id": 170, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + 
{ + "ph": "f", "id": 579, "pid": 0, "tid": "stream 7", "ts": 1621401187235555, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24610", + "ts": 1621401187235535, "dur": 19, + "args": { + "cbid": 211, "correlation": 579, + "external id": 170, "external ts": 1621401187235487 + } + }, + { + "ph": "s", "id": 579, "pid": 24572, "tid": 24610, "ts": 1621401187235535, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187236158, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 585, "external id": 176, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 585, "pid": 0, "tid": "stream 7", "ts": 1621401187236158, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187236138, "dur": 18, + "args": { + "cbid": 211, "correlation": 585, + "external id": 176, "external ts": 1621401187236091 + } + }, + { + "ph": "s", "id": 585, "pid": 24572, "tid": 24572, "ts": 1621401187236138, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187236278, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 590, "external id": 177, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 590, "pid": 0, "tid": "stream 7", "ts": 1621401187236278, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187236261, "dur": 15, + "args": { + "cbid": 211, "correlation": 590, + "external id": 177, "external ts": 1621401187236221 + } + }, + { + "ph": "s", "id": 590, "pid": 24572, "tid": 24572, "ts": 1621401187236261, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187236390, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 595, "external id": 178, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 595, "pid": 0, "tid": "stream 7", "ts": 1621401187236390, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187236373, "dur": 15, + "args": { + "cbid": 211, "correlation": 595, + "external id": 178, "external ts": 1621401187236334 + } + }, + { + "ph": "s", "id": 595, "pid": 24572, 
"tid": 24572, "ts": 1621401187236373, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "pid": 0, "tid": "stream 7", + "ts": 1621401187236501, "dur": 1, + "args": { + "queued": 0, "device": 0, "context": 1, + "stream": 7, "correlation": 600, "external id": 179, + "registers per thread": 20, + "shared memory": 0, + "blocks per SM": 0.0125, + "warps per SM": 0.025, + "grid": [1, 1, 1], + "block": [64, 1, 1], + "theoretical occupancy %": 0 + } + }, + { + "ph": "f", "id": 600, "pid": 0, "tid": "stream 7", "ts": 1621401187236501, + "cat": "async", "name": "launch", "bp": "e" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 24572, "tid": "24572", + "ts": 1621401187236483, "dur": 15, + "args": { + "cbid": 211, "correlation": 600, + "external id": 179, "external ts": 1621401187236444 + } + }, + { + "ph": "s", "id": 600, "pid": 24572, "tid": 24572, "ts": 1621401187236483, + "cat": "async", "name": "launch" + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaDeviceSynchronize", "pid": 24572, "tid": "24572", + "ts": 1621401187236853, "dur": 10, + "args": { + "cbid": 165, "correlation": 605, + "external id": 0, "external ts": 0 + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 24572, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 24572, "tid": 0, + "args": { + "labels": "CPU" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 0, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 0, "tid": 0, + "args": { + "labels": "GPU 0" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 1, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 1, "tid": 0, + "args": { + "labels": "GPU 1" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 2, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 2, "tid": 0, + "args": { + "labels": "GPU 2" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 3, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 3, "tid": 0, + "args": { + "labels": "GPU 3" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 4, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 4, "tid": 0, + "args": { + "labels": "GPU 4" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 5, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 5, "tid": 0, + "args": { + "labels": "GPU 5" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 6, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 1621401187223005, "pid": 6, "tid": 0, + "args": { + "labels": "GPU 6" + } + }, + { + "name": "process_name", "ph": "M", "ts": 1621401187223005, "pid": 7, "tid": 0, + "args": { + "name": "python" + } + }, + { + "name": "process_labels", "ph": "M", 
"ts": 1621401187223005, "pid": 7, "tid": 0, + "args": { + "labels": "GPU 7" + } + }, + { + "name": "thread_name", "ph": "M", "ts": 1621401187223005, "pid": 24572, "tid": "24610", + "args": { + "name": "thread 24610 (python)" + } + }, + { + "name": "thread_name", "ph": "M", "ts": 1621401187223005, "pid": 24572, "tid": "24572", + "args": { + "name": "thread 24572 (python)" + } + }, + { + "ph": "X", "cat": "Trace", "ts": 1621401187223005, "dur": 13896, + "pid": "Traces", "tid": "PyTorch Profiler", + "name": "PyTorch Profiler (0)", + "args": { + "Op count": 0 + } + }, + { + "name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g", + "pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 1621401187223005 + }, + { + "name": "Record Window End", "ph": "i", "s": "g", + "pid": "", "tid": "", "ts": 1621401187237108 + } +]} \ No newline at end of file diff --git a/tb_plugins/profiling/tb_plugin/test/result_check_file.txt b/tb_plugins/profiling/tb_plugin/test/result_check_file.txt new file mode 100644 index 0000000000000000000000000000000000000000..845aae2420fd3c75808a58937b0a7a794777914d --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/test/result_check_file.txt @@ -0,0 +1,10 @@ +{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 98598, "
Step 5
Total: 187948us
Kernel: 98598us
Percentage: 52.46%
", 1941, "
Step 5
Total: 187948us
Memcpy: 1941us
Percentage: 1.03%
", 90, "
Step 5
Total: 187948us
Memset: 90us
Percentage: 0.05%
", 2796, "
Step 5
Total: 187948us
Runtime: 2796us
Percentage: 1.49%
", 69317, "
Step 5
Total: 187948us
DataLoader: 69317us
Percentage: 36.88%
", 14091, "
Step 5
Total: 187948us
CPU Exec: 14091us
Percentage: 7.5%
", 1115, "
Step 5
Total: 187948us
Other: 1115us
Percentage: 0.59%
"], ["6", 98570, "
Step 6
Total: 175153us
Kernel: 98570us
Percentage: 56.28%
", 1947, "
Step 6
Total: 175153us
Memcpy: 1947us
Percentage: 1.11%
", 89, "
Step 6
Total: 175153us
Memset: 89us
Percentage: 0.05%
", 2762, "
Step 6
Total: 175153us
Runtime: 2762us
Percentage: 1.58%
", 57669, "
Step 6
Total: 175153us
DataLoader: 57669us
Percentage: 32.92%
", 12968, "
Step 6
Total: 175153us
CPU Exec: 12968us
Percentage: 7.4%
", 1148, "
Step 6
Total: 175153us
Other: 1148us
Percentage: 0.66%
"], ["7", 98596, "
Step 7
Total: 179733us
Kernel: 98596us
Percentage: 54.86%
", 1931, "
Step 7
Total: 179733us
Memcpy: 1931us
Percentage: 1.07%
", 91, "
Step 7
Total: 179733us
Memset: 91us
Percentage: 0.05%
", 2877, "
Step 7
Total: 179733us
Runtime: 2877us
Percentage: 1.6%
", 61257, "
Step 7
Total: 179733us
DataLoader: 61257us
Percentage: 34.08%
", 13768, "
Step 7
Total: 179733us
CPU Exec: 13768us
Percentage: 7.66%
", 1213, "
Step 7
Total: 179733us
Other: 1213us
Percentage: 0.67%
"], ["8", 98623, "
Step 8
Total: 174564us
Kernel: 98623us
Percentage: 56.5%
", 1938, "
Step 8
Total: 174564us
Memcpy: 1938us
Percentage: 1.11%
", 89, "
Step 8
Total: 174564us
Memset: 89us
Percentage: 0.05%
", 2841, "
Step 8
Total: 174564us
Runtime: 2841us
Percentage: 1.63%
", 56453, "
Step 8
Total: 174564us
DataLoader: 56453us
Percentage: 32.34%
", 13420, "
Step 8
Total: 174564us
CPU Exec: 13420us
Percentage: 7.69%
", 1200, "
Step 8
Total: 174564us
Other: 1200us
Percentage: 0.69%
"], ["9", 98504, "
Step 9
Total: 182172us
Kernel: 98504us
Percentage: 54.07%
", 1937, "
Step 9
Total: 182172us
Memcpy: 1937us
Percentage: 1.06%
", 87, "
Step 9
Total: 182172us
Memset: 87us
Percentage: 0.05%
", 2788, "
Step 9
Total: 182172us
Runtime: 2788us
Percentage: 1.53%
", 62690, "
Step 9
Total: 182172us
DataLoader: 62690us
Percentage: 34.41%
", 15025, "
Step 9
Total: 182172us
CPU Exec: 15025us
Percentage: 8.25%
", 1141, "
Step 9
Total: 182172us
Other: 1141us
Percentage: 0.63%
"], ["10", 98641, "
Step 10
Total: 165983us
Kernel: 98641us
Percentage: 59.43%
", 1798, "
Step 10
Total: 165983us
Memcpy: 1798us
Percentage: 1.08%
", 88, "
Step 10
Total: 165983us
Memset: 88us
Percentage: 0.05%
", 3381, "
Step 10
Total: 165983us
Runtime: 3381us
Percentage: 2.04%
", 48185, "
Step 10
Total: 165983us
DataLoader: 48185us
Percentage: 29.03%
", 12773, "
Step 10
Total: 165983us
CPU Exec: 12773us
Percentage: 7.7%
", 1117, "
Step 10
Total: 165983us
Other: 1117us
Percentage: 0.67%
"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 177592, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 98589, "extra": 55.51}, {"name": "Memcpy", "description": "", "value": 1915, "extra": 1.08}, {"name": "Memset", "description": "", "value": 89, "extra": 0.05}, {"name": "Runtime", "description": "", "value": 2908, "extra": 1.64}, {"name": "DataLoader", "description": "", "value": 59262, "extra": 33.37}, {"name": "CPU Exec", "description": "", "value": 13674, "extra": 7.7}, {"name": "Other", "description": "", "value": 1156, "extra": 0.65}]}], "recommendations": "
  • This run has high time cost on input data loading. 33.4% of the step time is in DataLoader. You could try to set num_workers on DataLoader's construction and enable multi-processes on data loading.
  • Kernels with 68% time are launched by Tensor Cores eligible operators. You could enable Automatic Mixed Precision to speedup by using FP16.
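The num_workers suggestion in the first bullet above refers to a constructor argument of PyTorch's torch.utils.data.DataLoader. A minimal sketch of acting on it, assuming a map-style dataset; the dataset, batch size, and worker count below are illustrative, not values taken from this profile:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Illustrative stand-in for the profiled input pipeline.
    dataset = TensorDataset(torch.randn(1024, 3, 224, 224),
                            torch.randint(0, 10, (1024,)))

    # num_workers > 0 prepares batches in separate worker processes so data
    # loading overlaps with GPU compute; pin_memory=True speeds up the
    # subsequent host-to-device copies.
    loader = DataLoader(dataset, batch_size=32, shuffle=True,
                        num_workers=4, pin_memory=True)

With roughly a third of each step spent in DataLoader in the breakdown above, this is the first knob worth trying; the right worker count depends on storage and preprocessing cost.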
", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "55.51 %"}, {"title": "Est. SM Efficiency", "value": "54.68 %"}, {"title": "Est. Achieved Occupancy", "value": "49.13 %"}, {"title": "Kernel Time using Tensor Cores", "value": "0.0 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. The higher, the better. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. The higher, the better. This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nFor most cases such as memory bandwidth bounded kernels, the higher the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted average of all kernels' OCC_K using kernel's execution duration as weight. 
It shows fine-grained low-level GPU utilization.\n\nKernel using Tensor Cores:\nTotal GPU Time for Tensor Core kernels / Total GPU Time for all kernels.\n"}} +{"device_total_time": {"title": "Device Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward", 273428], ["CudnnConvolutionBackward", 273428], ["aten::cudnn_convolution_backward_weight", 142461], ["aten::cudnn_convolution_backward_input", 130967], ["aten::cudnn_convolution", 126619], ["aten::_convolution", 126619], ["aten::convolution", 126619], ["aten::conv2d", 126619], ["aten::cudnn_batch_norm_backward", 61939], ["CudnnBatchNormBackward", 61939], ["aten::cudnn_batch_norm", 34245], ["aten::_batch_norm_impl_index", 34245], ["aten::batch_norm", 34245], ["aten::threshold_backward", 27298], ["ReluBackward1", 27298], ["aten::add_", 24098], ["aten::clamp_min", 17860], ["aten::clamp_min_", 17860], ["aten::relu_", 17860], ["aten::add", 16038], ["aten::copy_", 11492], ["aten::to", 11492], ["aten::max_pool2d_with_indices_backward", 4677], ["MaxPool2DWithIndicesBackward", 4677], ["torch::autograd::AccumulateGrad", 3030], ["aten::mul_", 2409], ["aten::fill_", 1887], ["aten::zero_", 1881], ["aten::max_pool2d_with_indices", 1420], ["aten::max_pool2d", 1420], ["aten::mm", 275], ["AddmmBackward", 275], ["aten::mean", 212], ["aten::adaptive_avg_pool2d", 212], ["aten::addmm", 197], ["aten::linear", 197], ["aten::div", 144], ["MeanBackward1", 144], ["aten::cross_entropy_loss", 60], ["aten::_log_softmax_backward_data", 53], ["LogSoftmaxBackward", 53], ["aten::sum", 44], ["aten::_log_softmax", 42], ["aten::log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss", 18], ["aten::nll_loss_nd", 18], ["aten::nll_loss_backward", 18], ["NllLossBackward", 18], ["aten::ones_like", 6]]}, "device_self_time": {"title": "Device Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward_weight", 142461], ["aten::cudnn_convolution_backward_input", 130967], ["aten::cudnn_convolution", 126619], ["aten::cudnn_batch_norm_backward", 61939], ["aten::cudnn_batch_norm", 34245], ["aten::threshold_backward", 27298], ["aten::add_", 24098], ["aten::clamp_min", 17860], ["aten::add", 16038], ["aten::copy_", 11492], ["aten::max_pool2d_with_indices_backward", 3822], ["aten::mul_", 2409], ["aten::fill_", 1887], ["aten::max_pool2d_with_indices", 1420], ["aten::mm", 275], ["aten::mean", 212], ["aten::addmm", 197], ["aten::div", 144], ["aten::_log_softmax_backward_data", 53], ["aten::sum", 44], ["aten::_log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss_backward", 18]]}, "host_total_time": {"title": "Host Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["CudnnConvolutionBackward", 90989], ["aten::batch_norm", 87977], ["aten::cudnn_convolution_backward", 87772], ["aten::add_", 78125], ["aten::_batch_norm_impl_index", 78071], ["aten::conv2d", 77781], ["aten::cudnn_batch_norm", 71527], ["aten::convolution", 70394], ["aten::empty", 68147], ["aten::to", 64332], ["aten::_convolution", 64243], ["aten::cudnn_convolution", 56998], ["aten::copy_", 52853], ["aten::cudnn_convolution_backward_input", 41445], ["aten::cudnn_convolution_backward_weight", 40246], ["aten::div", 35158], ["CudnnBatchNormBackward", 34608], ["aten::contiguous", 31137], ["aten::cudnn_batch_norm_backward", 30460], ["aten::mul_", 29081], 
["torch::autograd::AccumulateGrad", 28494], ["aten::zero_", 27597], ["aten::empty_like", 26064], ["aten::stack", 24346], ["aten::relu_", 24181], ["aten::add", 19289], ["aten::cat", 17085], ["aten::fill_", 17059], ["aten::_cat", 16933], ["aten::clamp_min_", 15665], ["aten::view", 14027], ["aten::resize_", 12406], ["aten::empty_strided", 11829], ["ReluBackward1", 11656], ["aten::clamp_min", 10311], ["aten::permute", 9775], ["aten::threshold_backward", 9482], ["aten::as_strided", 7600], ["aten::unsqueeze", 6603], ["aten::linear", 1408], ["AddmmBackward", 1303], ["aten::cross_entropy_loss", 1180], ["aten::zeros", 1105], ["aten::addmm", 1034], ["MeanBackward1", 987], ["aten::mm", 860], ["NllLossBackward", 716], ["aten::max_pool2d", 687], ["aten::nll_loss_backward", 614], ["aten::t", 584], ["aten::log_softmax", 567], ["aten::max_pool2d_with_indices", 562], ["aten::adaptive_avg_pool2d", 561], ["aten::nll_loss_nd", 495], ["MaxPool2DWithIndicesBackward", 484], ["aten::ones_like", 452], ["aten::mean", 445], ["aten::_log_softmax", 433], ["aten::nll_loss", 414], ["aten::max_pool2d_with_indices_backward", 411], ["LogSoftmaxBackward", 359], ["aten::narrow", 350], ["aten::nll_loss_forward", 346], ["aten::transpose", 329], ["aten::sum", 327], ["aten::_log_softmax_backward_data", 306], ["aten::expand", 229], ["aten::slice", 223], ["aten::detach_", 208], ["AddBackward0", 175], ["aten::flatten", 164], ["TBackward", 103], ["detach_", 100], ["ViewBackward", 80], ["aten::reshape", 55], ["aten::conj", 12]]}, "host_self_time": {"title": "Host Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::empty", 68147], ["aten::add_", 51013], ["aten::copy_", 40255], ["aten::cudnn_convolution", 33121], ["aten::cudnn_convolution_backward_input", 29324], ["aten::cudnn_convolution_backward_weight", 22804], ["aten::mul_", 20515], ["aten::div", 20135], ["aten::cudnn_batch_norm", 19843], ["aten::_cat", 16282], ["aten::to", 14834], ["aten::add", 14329], ["aten::view", 14027], ["aten::resize_", 12406], ["aten::cudnn_batch_norm_backward", 12238], ["aten::empty_strided", 11829], ["aten::empty_like", 11742], ["aten::zero_", 10693], ["aten::batch_norm", 9906], ["aten::fill_", 9879], ["aten::relu_", 8516], ["aten::as_strided", 7600], ["aten::conv2d", 7387], ["aten::_convolution", 7245], ["aten::clamp_min", 7106], ["aten::_batch_norm_impl_index", 6544], ["aten::convolution", 6151], ["aten::threshold_backward", 6090], ["aten::cudnn_convolution_backward", 6081], ["aten::permute", 5515], ["aten::contiguous", 5510], ["torch::autograd::AccumulateGrad", 5457], ["aten::clamp_min_", 5354], ["CudnnBatchNormBackward", 4148], ["aten::unsqueeze", 3574], ["CudnnConvolutionBackward", 3217], ["ReluBackward1", 2174], ["aten::zeros", 659], ["aten::stack", 658], ["aten::addmm", 639], ["aten::mm", 575], ["MeanBackward1", 541], ["aten::max_pool2d_with_indices", 477], ["aten::nll_loss_backward", 388], ["aten::nll_loss_forward", 266], ["aten::t", 255], ["aten::mean", 234], ["aten::transpose", 197], ["AddmmBackward", 182], ["aten::max_pool2d_with_indices_backward", 176], ["AddBackward0", 175], ["aten::_log_softmax", 170], ["aten::sum", 153], ["aten::cat", 152], ["aten::expand", 150], ["aten::narrow", 127], ["aten::max_pool2d", 125], ["aten::linear", 124], ["aten::slice", 123], ["aten::cross_entropy_loss", 118], ["aten::adaptive_avg_pool2d", 116], ["aten::detach_", 108], ["aten::_log_softmax_backward_data", 108], ["NllLossBackward", 102], ["detach_", 100], ["aten::ones_like", 95], 
["aten::log_softmax", 90], ["aten::flatten", 84], ["aten::nll_loss_nd", 81], ["MaxPool2DWithIndicesBackward", 73], ["aten::nll_loss", 68], ["LogSoftmaxBackward", 53], ["aten::reshape", 29], ["ViewBackward", 25], ["TBackward", 18], ["aten::conj", 12]]}} +{"metadata": {"sort": "device_self_duration", "tooltips": {"tc_eligible": "Whether this operator is eligible to use Tensor Cores.", "tc_self_ratio": "Time of self-kernels with Tensor Cores / Time of self-kernels.", "tc_total_ratio": "Time of kernels with Tensor Cores / Time of kernels."}}, "data": [{"name": "aten::cudnn_convolution_backward_weight", "calls": 318, "device_self_duration": 142461, "device_total_duration": 142461, "host_self_duration": 22804, "host_total_duration": 40246, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward_input", "calls": 312, "device_self_duration": 130967, "device_total_duration": 130967, "host_self_duration": 29324, "host_total_duration": 41445, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution", "calls": 318, "device_self_duration": 126619, "device_total_duration": 126619, "host_self_duration": 33121, "host_total_duration": 56998, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::cudnn_batch_norm_backward", "calls": 318, "device_self_duration": 61939, "device_total_duration": 61939, "host_self_duration": 12238, "host_total_duration": 30460, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_batch_norm", "calls": 318, "device_self_duration": 34245, "device_total_duration": 34245, "host_self_duration": 19843, "host_total_duration": 71527, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::threshold_backward", "calls": 294, "device_self_duration": 27298, "device_total_duration": 27298, "host_self_duration": 6090, "host_total_duration": 9482, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::add_", "calls": 2994, "device_self_duration": 24098, "device_total_duration": 24098, "host_self_duration": 51013, "host_total_duration": 78125, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::clamp_min", "calls": 294, "device_self_duration": 17860, "device_total_duration": 17860, "host_self_duration": 7106, "host_total_duration": 10311, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::add", "calls": 414, "device_self_duration": 16038, "device_total_duration": 16038, "host_self_duration": 14329, "host_total_duration": 19289, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::copy_", "calls": 588, "device_self_duration": 11492, "device_total_duration": 11492, "host_self_duration": 40255, "host_total_duration": 52853, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices_backward", "calls": 6, "device_self_duration": 3822, "device_total_duration": 4677, "host_self_duration": 176, "host_total_duration": 411, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::mul_", "calls": 966, "device_self_duration": 2409, "device_total_duration": 
2409, "host_self_duration": 20515, "host_total_duration": 29081, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::fill_", "calls": 978, "device_self_duration": 1887, "device_total_duration": 1887, "host_self_duration": 9879, "host_total_duration": 17059, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices", "calls": 6, "device_self_duration": 1420, "device_total_duration": 1420, "host_self_duration": 477, "host_total_duration": 562, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::mm", "calls": 12, "device_self_duration": 275, "device_total_duration": 275, "host_self_duration": 575, "host_total_duration": 860, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::mean", "calls": 6, "device_self_duration": 212, "device_total_duration": 212, "host_self_duration": 234, "host_total_duration": 445, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::addmm", "calls": 6, "device_self_duration": 197, "device_total_duration": 197, "host_self_duration": 639, "host_total_duration": 1034, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::div", "calls": 198, "device_self_duration": 144, "device_total_duration": 144, "host_self_duration": 20135, "host_total_duration": 35158, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::_log_softmax_backward_data", "calls": 6, "device_self_duration": 53, "device_total_duration": 53, "host_self_duration": 108, "host_total_duration": 306, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::sum", "calls": 6, "device_self_duration": 44, "device_total_duration": 44, "host_self_duration": 153, "host_total_duration": 327, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::_log_softmax", "calls": 6, "device_self_duration": 42, "device_total_duration": 42, "host_self_duration": 170, "host_total_duration": 433, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_forward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 266, "host_total_duration": 346, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_backward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 388, "host_total_duration": 614, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::empty", "calls": 4404, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 68147, "host_total_duration": 68147, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::zero_", "calls": 996, "device_self_duration": 0, "device_total_duration": 1881, "host_self_duration": 10693, "host_total_duration": 27597, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::zeros", "calls": 24, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 659, "host_total_duration": 1105, "tc_eligible": "No", "tc_self_ratio": 0, 
"tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::view", "calls": 846, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 14027, "host_total_duration": 14027, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::as_strided", "calls": 432, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 7600, "host_total_duration": 7600, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::permute", "calls": 192, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 5515, "host_total_duration": 9775, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::empty_like", "calls": 528, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 11742, "host_total_duration": 26064, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::contiguous", "calls": 192, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 5510, "host_total_duration": 31137, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::empty_strided", "calls": 402, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 11829, "host_total_duration": 11829, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::to", "calls": 414, "device_self_duration": 0, "device_total_duration": 11492, "host_self_duration": 14834, "host_total_duration": 64332, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::unsqueeze", "calls": 192, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 3574, "host_total_duration": 6603, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::resize_", "calls": 1902, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 12406, "host_total_duration": 12406, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::slice", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 123, "host_total_duration": 223, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::narrow", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 127, "host_total_duration": 350, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::_cat", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 16282, "host_total_duration": 16933, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::cat", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 152, "host_total_duration": 17085, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::stack", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 658, "host_total_duration": 24346, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "detach_", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 100, "host_total_duration": 100, "tc_eligible": 
"No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::detach_", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 108, "host_total_duration": 208, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::_convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 7245, "host_total_duration": 64243, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 6151, "host_total_duration": 70394, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::conv2d", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 7387, "host_total_duration": 77781, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::_batch_norm_impl_index", "calls": 318, "device_self_duration": 0, "device_total_duration": 34245, "host_self_duration": 6544, "host_total_duration": 78071, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::batch_norm", "calls": 318, "device_self_duration": 0, "device_total_duration": 34245, "host_self_duration": 9906, "host_total_duration": 87977, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::clamp_min_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17860, "host_self_duration": 5354, "host_total_duration": 15665, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::relu_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17860, "host_self_duration": 8516, "host_total_duration": 24181, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 1420, "host_self_duration": 125, "host_total_duration": 687, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::adaptive_avg_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 212, "host_self_duration": 116, "host_total_duration": 561, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::flatten", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 84, "host_total_duration": 164, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::transpose", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 197, "host_total_duration": 329, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::t", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 255, "host_total_duration": 584, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::expand", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 150, "host_total_duration": 229, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::linear", "calls": 6, "device_self_duration": 
0, "device_total_duration": 197, "host_self_duration": 124, "host_total_duration": 1408, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::log_softmax", "calls": 6, "device_self_duration": 0, "device_total_duration": 42, "host_self_duration": 90, "host_total_duration": 567, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 68, "host_total_duration": 414, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_nd", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 81, "host_total_duration": 495, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::cross_entropy_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 60, "host_self_duration": 118, "host_total_duration": 1180, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::ones_like", "calls": 6, "device_self_duration": 0, "device_total_duration": 6, "host_self_duration": 95, "host_total_duration": 452, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "NllLossBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 102, "host_total_duration": 716, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "LogSoftmaxBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 53, "host_self_duration": 53, "host_total_duration": 359, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::conj", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 12, "host_total_duration": 12, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "AddmmBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 275, "host_self_duration": 182, "host_total_duration": 1303, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "torch::autograd::AccumulateGrad", "calls": 966, "device_self_duration": 0, "device_total_duration": 3030, "host_self_duration": 5457, "host_total_duration": 28494, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "TBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 18, "host_total_duration": 103, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "aten::reshape", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 29, "host_total_duration": 55, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "ViewBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 25, "host_total_duration": 80, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "MeanBackward1", "calls": 6, "device_self_duration": 0, "device_total_duration": 144, "host_self_duration": 541, "host_total_duration": 987, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, 
{"name": "ReluBackward1", "calls": 294, "device_self_duration": 0, "device_total_duration": 27298, "host_self_duration": 2174, "host_total_duration": 11656, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "AddBackward0", "calls": 96, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 175, "host_total_duration": 175, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "CudnnBatchNormBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 61939, "host_self_duration": 4148, "host_total_duration": 34608, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward", "calls": 318, "device_self_duration": 0, "device_total_duration": 273428, "host_self_duration": 6081, "host_total_duration": 87772, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "CudnnConvolutionBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 273428, "host_self_duration": 3217, "host_total_duration": 90989, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "MaxPool2DWithIndicesBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 4677, "host_self_duration": 73, "host_total_duration": 484, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}]} +{"metadata": {"sort": "Total Duration (us)"}, "data": {"columns": [{"type": "string", "name": "Name"}, {"type": "string", "name": "Tensor Cores Used", "tooltip": "Whether this kernel uses Tensor Cores."}, {"type": "number", "name": "Calls"}, {"type": "number", "name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM = blocks of this kernel / SM number of this GPU.\nIf this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n\"Mean Blocks per SM\" is the weighted average of all calls of this kernel, using each call's execution duration as weight."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nFor most cases such as memory bandwidth bounded kernels, the higher the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, using each call's execution duration as weight. 
It shows fine-grained low-level GPU utilization."}], "rows": [["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", "No", 162, 80756, 498, 1017, 323, 42.25, 29.97], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 156, 66472, 426, 745, 345, 9.78, 38.0], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", "No", 264, 59642, 226, 915, 45, 4.34, 67.98], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "No", 3090, 39814, 13, 378, 1, 641.54, 92.32], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 90, 36957, 411, 748, 347, 12.34, 50.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", "No", 294, 27298, 93, 377, 13, 653.06, 100.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", "No", 150, 27060, 180, 452, 53, 3.12, 64.06], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 60, 25782, 430, 729, 352, 3.9, 42.09], ["volta_sgemm_64x64_nt", "No", 102, 21084, 207, 279, 184, 10.24, 19.38], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", "No", 48, 20448, 426, 676, 307, 6.83, 25.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", "No", 294, 17860, 61, 252, 5, 666.65, 100.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", "No", 36, 12704, 353, 362, 344, 22.4, 25.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", "No", 30, 9597, 320, 510, 252, 12.9, 19.0], ["volta_sgemm_128x32_nt", "No", 24, 8629, 360, 477, 18, 0.97, 11.51], ["volta_sgemm_64x64_nn", "No", 42, 8551, 204, 217, 195, 12.34, 24.14], ["volta_scudnn_128x64_relu_interior_nn_v1", "No", 30, 8022, 267, 316, 94, 37.1, 25.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", "No", 12, 7817, 651, 671, 635, 15.96, 19.0], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, 
float, int, float, float, cudnnStatus_t*, bool)", "No", 168, 7185, 43, 89, 13, 12.57, 75.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 12, 7068, 589, 987, 193, 85.34, 37.5], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", "No", 120, 5369, 45, 73, 19, 10.0, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 12, 5219, 435, 437, 432, 9.8, 31.0], ["void explicit_convolve_sgemm(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, unsigned long long, int, unsigned long long, int, float, float, int, float const*, float const*)", "No", 6, 4759, 793, 796, 790, 9.8, 31.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", "No", 120, 4710, 39, 66, 17, 10.11, 50.0], ["volta_scudnn_128x128_stridedB_interior_nn_v1", "No", 18, 4693, 261, 281, 252, 9.8, 25.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", "No", 78, 4692, 60, 126, 20, 15.46, 38.0], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", "No", 162, 4631, 29, 143, 5, 496.39, 100.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", "No", 78, 4573, 59, 125, 17, 15.69, 50.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 6, 4065, 678, 692, 652, 6.4, 25.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 6, 3917, 653, 686, 595, 4.9, 25.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", "No", 6, 3822, 637, 638, 636, 1254.4, 100.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", "No", 6, 3720, 620, 623, 614, 5.6, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", "No", 6, 3627, 604, 606, 603, 39.2, 25.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", "No", 12, 3501, 292, 296, 286, 19.6, 25.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_nhwc_tn_v1", "No", 6, 3270, 545, 627, 526, 4.9, 25.0], ["volta_scudnn_128x64_relu_small_nn_v1", "No", 12, 3265, 272, 279, 254, 9.8, 25.0], ["volta_scudnn_128x64_relu_xregs_large_nn_v1", "No", 6, 3200, 533, 607, 516, 4.9, 19.0], ["volta_sgemm_32x128_nn", "No", 18, 3053, 170, 171, 168, 22.05, 50.0], ["volta_scudnn_128x128_relu_interior_nn_v1", "No", 6, 3010, 502, 508, 495, 9.8, 25.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", "No", 6, 2995, 499, 505, 493, 19.6, 25.0], ["volta_sgemm_32x128_nt", "No", 18, 2843, 158, 159, 156, 22.05, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", "No", 120, 2662, 22, 67, 5, 8.68, 73.22], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", "No", 966, 2409, 2, 25, 1, 43.72, 58.39], ["void cudnn::bn_bw_1C11_singleread(float, float, float, 
float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", "No", 54, 2297, 43, 73, 18, 20.81, 75.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "No", 978, 1887, 2, 143, 0, 599.07, 86.78], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", "No", 78, 1504, 19, 69, 5, 8.06, 41.33], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", "No", 6, 1420, 237, 239, 234, 313.6, 100.0], ["void cudnn::cnn::im2col4d_kernel(cudnn::cnn::im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*)", "No", 6, 614, 102, 103, 101, 0.95, 24.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", "No", 6, 584, 97, 100, 93, 9.8, 19.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", "No", 12, 453, 38, 68, 9, 73.28, 100.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "No", 138, 342, 2, 4, 1, 0.13, 1.73], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "No", 318, 322, 1, 2, 1, 0.01, 0.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "No", 6, 212, 35, 36, 35, 51.2, 100.0], ["volta_sgemm_64x32_sliced1x4_nn", "No", 6, 150, 25, 26, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", "No", 6, 149, 25, 26, 24, 1.0, 13.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", "No", 6, 144, 24, 24, 24, 156.8, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", "No", 36, 134, 4, 5, 2, 0.4, 3.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", "No", 6, 105, 18, 18, 17, 22.4, 100.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", "No", 66, 81, 1, 2, 1, 0.15, 1.68], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", "No", 66, 81, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", "No", 72, 73, 1, 2, 1, 0.02, 0.0], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", "No", 6, 53, 9, 9, 8, 0.1, 1.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "No", 6, 44, 7, 8, 7, 0.03, 0.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", "No", 6, 42, 7, 7, 7, 0.1, 1.0], 
["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", "No", 12, 30, 2, 3, 2, 4.44, 28.0], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", "No", 6, 30, 5, 5, 5, 1.56, 5.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", "No", 6, 18, 3, 3, 3, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", "No", 6, 12, 2, 2, 2, 0.01, 0.0]]}} +{"total": {"columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 80756], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 66472], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 59642], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 39814], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 36957], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 27298], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 27060], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 25782], ["volta_sgemm_64x64_nt", 21084], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 20448], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 17860], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 12704], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 9597], ["volta_sgemm_128x32_nt", 8629], ["volta_sgemm_64x64_nn", 8551], ["volta_scudnn_128x64_relu_interior_nn_v1", 8022], 
["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 7817], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 7185], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 7068], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 5369], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 5219], ["void explicit_convolve_sgemm(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, unsigned long long, int, unsigned long long, int, float, float, int, float const*, float const*)", 4759], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 4710], ["volta_scudnn_128x128_stridedB_interior_nn_v1", 4693], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 4692], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 4631], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 4573], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 4065], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 3917], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 3822], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 3720], ["volta_scudnn_128x64_relu_medium_nn_v1", 3627], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 3501], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_nhwc_tn_v1", 3270], ["volta_scudnn_128x64_relu_small_nn_v1", 3265], ["volta_scudnn_128x64_relu_xregs_large_nn_v1", 3200], ["volta_sgemm_32x128_nn", 3053], ["volta_scudnn_128x128_relu_interior_nn_v1", 3010], ["volta_scudnn_128x128_stridedB_small_nn_v1", 2995], ["volta_sgemm_32x128_nt", 2843], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 2662], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 2409], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 2297], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", 1887], ["void 
cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 1504], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 1420], ["void cudnn::cnn::im2col4d_kernel(cudnn::cnn::im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*)", 614], ["volta_scudnn_128x64_stridedB_small_nn_v1", 584], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 453], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 342], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 322], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 212], ["volta_sgemm_64x32_sliced1x4_nn", 150], ["volta_sgemm_64x32_sliced1x4_tn", 149], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 144], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 134], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 105], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 81], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 81], ["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 73], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 53], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 44], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 42], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 30], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 30], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 18], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 12]]}} +{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": 
"tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 99778, "
Step 5
Total: 182306us
Kernel: 99778us
Percentage: 54.73%
", 3606, "
Step 5
Total: 182306us
Memcpy: 3606us
Percentage: 1.98%
", 98, "
Step 5
Total: 182306us
Memset: 98us
Percentage: 0.05%
", 41028, "
Step 5
Total: 182306us
Runtime: 41028us
Percentage: 22.51%
", 4341, "
Step 5
Total: 182306us
DataLoader: 4341us
Percentage: 2.38%
", 27460, "
Step 5
Total: 182306us
CPU Exec: 27460us
Percentage: 15.06%
", 5995, "
Step 5
Total: 182306us
Other: 5995us
Percentage: 3.29%
"], ["6", 99208, "
Step 6
Total: 126183us
Kernel: 99208us
Percentage: 78.62%
", 2948, "
Step 6
Total: 126183us
Memcpy: 2948us
Percentage: 2.34%
", 98, "
Step 6
Total: 126183us
Memset: 98us
Percentage: 0.08%
", 3406, "
Step 6
Total: 126183us
Runtime: 3406us
Percentage: 2.7%
", 0, "
Step 6
Total: 126183us
DataLoader: 0us
Percentage: 0.0%
", 16404, "
Step 6
Total: 126183us
CPU Exec: 16404us
Percentage: 13.0%
", 4119, "
Step 6
Total: 126183us
Other: 4119us
Percentage: 3.26%
"], ["7", 99114, "
Step 7
Total: 127181us
Kernel: 99114us
Percentage: 77.93%
", 2949, "
Step 7
Total: 127181us
Memcpy: 2949us
Percentage: 2.32%
", 98, "
Step 7
Total: 127181us
Memset: 98us
Percentage: 0.08%
", 3417, "
Step 7
Total: 127181us
Runtime: 3417us
Percentage: 2.69%
", 6, "
Step 7
Total: 127181us
DataLoader: 6us
Percentage: 0.0%
", 19521, "
Step 7
Total: 127181us
CPU Exec: 19521us
Percentage: 15.35%
", 2076, "
Step 7
Total: 127181us
Other: 2076us
Percentage: 1.63%
"], ["8", 99021, "
Step 8
Total: 123079us
Kernel: 99021us
Percentage: 80.45%
", 2975, "
Step 8
Total: 123079us
Memcpy: 2975us
Percentage: 2.42%
", 97, "
Step 8
Total: 123079us
Memset: 97us
Percentage: 0.08%
", 3544, "
Step 8
Total: 123079us
Runtime: 3544us
Percentage: 2.88%
", 0, "
Step 8
Total: 123079us
DataLoader: 0us
Percentage: 0.0%
", 15464, "
Step 8
Total: 123079us
CPU Exec: 15464us
Percentage: 12.56%
", 1978, "
Step 8
Total: 123079us
Other: 1978us
Percentage: 1.61%
"], ["9", 98791, "
Step 9
Total: 163461us
Kernel: 98791us
Percentage: 60.44%
", 3596, "
Step 9
Total: 163461us
Memcpy: 3596us
Percentage: 2.2%
", 97, "
Step 9
Total: 163461us
Memset: 97us
Percentage: 0.06%
", 8275, "
Step 9
Total: 163461us
Runtime: 8275us
Percentage: 5.06%
", 1370, "
Step 9
Total: 163461us
DataLoader: 1370us
Percentage: 0.84%
", 43905, "
Step 9
Total: 163461us
CPU Exec: 43905us
Percentage: 26.86%
", 7427, "
Step 9
Total: 163461us
Other: 7427us
Percentage: 4.54%
"], ["10", 98956, "
Step 10
Total: 124198us
Kernel: 98956us
Percentage: 79.68%
", 2885, "
Step 10
Total: 124198us
Memcpy: 2885us
Percentage: 2.32%
", 98, "
Step 10
Total: 124198us
Memset: 98us
Percentage: 0.08%
", 3714, "
Step 10
Total: 124198us
Runtime: 3714us
Percentage: 2.99%
", 1400, "
Step 10
Total: 124198us
DataLoader: 1400us
Percentage: 1.13%
", 13235, "
Step 10
Total: 124198us
CPU Exec: 13235us
Percentage: 10.66%
", 3910, "
Step 10
Total: 124198us
Other: 3910us
Percentage: 3.15%
"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 141068, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 99145, "extra": 70.28}, {"name": "Memcpy", "description": "", "value": 3160, "extra": 2.24}, {"name": "Memset", "description": "", "value": 98, "extra": 0.07}, {"name": "Runtime", "description": "", "value": 10564, "extra": 7.49}, {"name": "DataLoader", "description": "", "value": 1186, "extra": 0.84}, {"name": "CPU Exec", "description": "", "value": 22665, "extra": 16.07}, {"name": "Other", "description": "", "value": 4251, "extra": 3.01}]}], "recommendations": "
  • Kernels with 68% time are launched by Tensor Cores eligible operators. You could enable Automatic Mixed Precision to speedup by using FP16.
", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "70.27 %"}, {"title": "Est. SM Efficiency", "value": "69.22 %"}, {"title": "Est. Achieved Occupancy", "value": "48.91 %"}, {"title": "Kernel Time using Tensor Cores", "value": "0.0 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. The higher, the better. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. The higher, the better. This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nFor most cases such as memory bandwidth bounded kernels, the higher the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted average of all kernels' OCC_K using kernel's execution duration as weight. 
It shows fine-grained low-level GPU utilization.\n\nKernel using Tensor Cores:\nTotal GPU Time for Tensor Core kernels / Total GPU Time for all kernels.\n"}} +{"device_total_time": {"title": "Device Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward", 274794], ["CudnnConvolutionBackward", 274794], ["aten::cudnn_convolution_backward_weight", 141300], ["aten::cudnn_convolution_backward_input", 133494], ["aten::cudnn_convolution", 128683], ["aten::_convolution", 128683], ["aten::convolution", 128683], ["aten::conv2d", 128683], ["aten::cudnn_batch_norm_backward", 61899], ["CudnnBatchNormBackward", 61899], ["aten::cudnn_batch_norm", 34315], ["aten::_batch_norm_impl_index", 34315], ["aten::batch_norm", 34315], ["aten::threshold_backward", 27280], ["ReluBackward1", 27280], ["aten::add_", 24052], ["aten::to", 18959], ["aten::copy_", 18959], ["aten::clamp_min", 17862], ["aten::clamp_min_", 17862], ["aten::relu_", 17862], ["aten::add", 16026], ["aten::max_pool2d_with_indices_backward", 4695], ["MaxPool2DWithIndicesBackward", 4695], ["torch::autograd::AccumulateGrad", 3012], ["aten::mul_", 2395], ["aten::fill_", 1888], ["aten::zero_", 1882], ["aten::max_pool2d_with_indices", 1422], ["aten::max_pool2d", 1422], ["aten::mm", 274], ["AddmmBackward", 274], ["aten::mean", 210], ["aten::adaptive_avg_pool2d", 210], ["aten::addmm", 197], ["aten::linear", 197], ["aten::div", 145], ["MeanBackward1", 145], ["aten::cross_entropy_loss", 60], ["aten::_log_softmax_backward_data", 51], ["LogSoftmaxBackward", 51], ["aten::sum", 45], ["aten::_log_softmax", 42], ["aten::log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss", 18], ["aten::nll_loss_nd", 18], ["aten::nll_loss_backward", 18], ["NllLossBackward", 18], ["aten::ones_like", 6]]}, "device_self_time": {"title": "Device Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward_weight", 141300], ["aten::cudnn_convolution_backward_input", 133494], ["aten::cudnn_convolution", 128683], ["aten::cudnn_batch_norm_backward", 61899], ["aten::cudnn_batch_norm", 34315], ["aten::threshold_backward", 27280], ["aten::add_", 24052], ["aten::copy_", 18959], ["aten::clamp_min", 17862], ["aten::add", 16026], ["aten::max_pool2d_with_indices_backward", 3838], ["aten::mul_", 2395], ["aten::fill_", 1888], ["aten::max_pool2d_with_indices", 1422], ["aten::mm", 274], ["aten::mean", 210], ["aten::addmm", 197], ["aten::div", 145], ["aten::_log_softmax_backward_data", 51], ["aten::sum", 45], ["aten::_log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss_backward", 18]]}, "host_total_time": {"title": "Host Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["CudnnConvolutionBackward", 119890], ["aten::cudnn_convolution_backward", 115797], ["aten::batch_norm", 105589], ["aten::add_", 97540], ["aten::_batch_norm_impl_index", 95925], ["aten::conv2d", 91000], ["aten::cudnn_batch_norm", 87823], ["aten::empty", 82024], ["aten::convolution", 81781], ["aten::_convolution", 74086], ["aten::cudnn_convolution", 64167], ["aten::cudnn_convolution_backward_weight", 60712], ["aten::to", 57776], ["aten::copy_", 56915], ["aten::cudnn_convolution_backward_input", 47359], ["CudnnBatchNormBackward", 41825], ["torch::autograd::AccumulateGrad", 37189], ["aten::cudnn_batch_norm_backward", 36641], ["aten::mul_", 35389], ["aten::relu_", 29432], 
["aten::zero_", 28309], ["aten::add", 23831], ["aten::clamp_min_", 19059], ["aten::empty_like", 18591], ["aten::fill_", 17657], ["aten::resize_", 15019], ["ReluBackward1", 14944], ["aten::clamp_min", 12503], ["aten::threshold_backward", 12062], ["aten::view", 9046], ["AddmmBackward", 2026], ["aten::linear", 1463], ["aten::mm", 1424], ["aten::zeros", 1319], ["aten::cross_entropy_loss", 1225], ["aten::addmm", 1060], ["NllLossBackward", 889], ["aten::nll_loss_backward", 747], ["aten::t", 725], ["MeanBackward1", 663], ["aten::max_pool2d", 599], ["MaxPool2DWithIndicesBackward", 590], ["aten::adaptive_avg_pool2d", 581], ["aten::log_softmax", 580], ["aten::nll_loss_nd", 507], ["LogSoftmaxBackward", 500], ["aten::max_pool2d_with_indices_backward", 493], ["aten::ones_like", 470], ["aten::div", 469], ["aten::mean", 454], ["aten::empty_strided", 453], ["aten::_log_softmax_backward_data", 424], ["aten::max_pool2d_with_indices", 422], ["aten::_log_softmax", 420], ["aten::nll_loss", 418], ["aten::transpose", 413], ["aten::sum", 411], ["aten::nll_loss_forward", 343], ["aten::detach_", 323], ["aten::as_strided", 244], ["aten::expand", 237], ["aten::set_", 221], ["AddBackward0", 200], ["aten::flatten", 163], ["detach_", 156], ["TBackward", 151], ["ViewBackward", 132], ["aten::reshape", 88], ["aten::conj", 15]]}, "host_self_time": {"title": "Host Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::empty", 82024], ["aten::add_", 62385], ["aten::cudnn_convolution", 35632], ["aten::cudnn_convolution_backward_input", 31902], ["aten::cudnn_convolution_backward_weight", 30672], ["aten::mul_", 24617], ["aten::cudnn_batch_norm", 23800], ["aten::add", 17808], ["aten::cudnn_batch_norm_backward", 15118], ["aten::resize_", 15019], ["aten::zero_", 10815], ["aten::relu_", 10373], ["aten::_convolution", 9919], ["aten::batch_norm", 9664], ["aten::fill_", 9660], ["aten::conv2d", 9219], ["aten::view", 9046], ["aten::clamp_min", 8409], ["aten::empty_like", 8385], ["aten::_batch_norm_impl_index", 8102], ["aten::threshold_backward", 7820], ["aten::cudnn_convolution_backward", 7726], ["aten::convolution", 7695], ["torch::autograd::AccumulateGrad", 7181], ["aten::clamp_min_", 6556], ["CudnnBatchNormBackward", 5184], ["CudnnConvolutionBackward", 4093], ["ReluBackward1", 2882], ["aten::mm", 1032], ["aten::zeros", 877], ["aten::addmm", 652], ["aten::to", 547], ["aten::nll_loss_backward", 463], ["aten::empty_strided", 453], ["aten::div", 343], ["aten::max_pool2d_with_indices", 325], ["aten::t", 312], ["aten::nll_loss_forward", 264], ["aten::transpose", 254], ["aten::as_strided", 244], ["AddmmBackward", 244], ["aten::mean", 233], ["aten::copy_", 230], ["aten::set_", 221], ["aten::max_pool2d_with_indices_backward", 213], ["aten::sum", 201], ["AddBackward0", 200], ["aten::max_pool2d", 177], ["aten::_log_softmax", 168], ["aten::detach_", 167], ["detach_", 156], ["aten::expand", 152], ["NllLossBackward", 142], ["aten::_log_softmax_backward_data", 142], ["aten::linear", 139], ["aten::cross_entropy_loss", 138], ["aten::adaptive_avg_pool2d", 127], ["aten::log_softmax", 106], ["MaxPool2DWithIndicesBackward", 97], ["aten::ones_like", 96], ["MeanBackward1", 95], ["aten::nll_loss_nd", 89], ["aten::flatten", 88], ["LogSoftmaxBackward", 76], ["aten::nll_loss", 75], ["ViewBackward", 44], ["aten::reshape", 43], ["TBackward", 33], ["aten::conj", 15]]}} +{"metadata": {"sort": "device_self_duration", "tooltips": {"tc_eligible": "Whether this operator is eligible to use Tensor Cores.", 
"tc_self_ratio": "Time of self-kernels with Tensor Cores / Time of self-kernels.", "tc_total_ratio": "Time of kernels with Tensor Cores / Time of kernels."}}, "data": [{"name": "aten::cudnn_convolution_backward_weight", "calls": 318, "device_self_duration": 141300, "device_total_duration": 141300, "host_self_duration": 30672, "host_total_duration": 60712, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward_input", "calls": 312, "device_self_duration": 133494, "device_total_duration": 133494, "host_self_duration": 31902, "host_total_duration": 47359, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution", "calls": 318, "device_self_duration": 128683, "device_total_duration": 128683, "host_self_duration": 35632, "host_total_duration": 64167, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::cudnn_batch_norm_backward", "calls": 318, "device_self_duration": 61899, "device_total_duration": 61899, "host_self_duration": 15118, "host_total_duration": 36641, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_batch_norm", "calls": 318, "device_self_duration": 34315, "device_total_duration": 34315, "host_self_duration": 23800, "host_total_duration": 87823, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::threshold_backward", "calls": 294, "device_self_duration": 27280, "device_total_duration": 27280, "host_self_duration": 7820, "host_total_duration": 12062, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::add_", "calls": 2994, "device_self_duration": 24052, "device_total_duration": 24052, "host_self_duration": 62385, "host_total_duration": 97540, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::copy_", "calls": 12, "device_self_duration": 18959, "device_total_duration": 18959, "host_self_duration": 230, "host_total_duration": 56915, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::clamp_min", "calls": 294, "device_self_duration": 17862, "device_total_duration": 17862, "host_self_duration": 8409, "host_total_duration": 12503, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::add", "calls": 414, "device_self_duration": 16026, "device_total_duration": 16026, "host_self_duration": 17808, "host_total_duration": 23831, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices_backward", "calls": 6, "device_self_duration": 3838, "device_total_duration": 4695, "host_self_duration": 213, "host_total_duration": 493, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::mul_", "calls": 966, "device_self_duration": 2395, "device_total_duration": 2395, "host_self_duration": 24617, "host_total_duration": 35389, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::fill_", "calls": 978, "device_self_duration": 1888, "device_total_duration": 1888, "host_self_duration": 9660, "host_total_duration": 17657, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, 
"has_call_stack": true}, {"name": "aten::max_pool2d_with_indices", "calls": 6, "device_self_duration": 1422, "device_total_duration": 1422, "host_self_duration": 325, "host_total_duration": 422, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::mm", "calls": 12, "device_self_duration": 274, "device_total_duration": 274, "host_self_duration": 1032, "host_total_duration": 1424, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::mean", "calls": 6, "device_self_duration": 210, "device_total_duration": 210, "host_self_duration": 233, "host_total_duration": 454, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::addmm", "calls": 6, "device_self_duration": 197, "device_total_duration": 197, "host_self_duration": 652, "host_total_duration": 1060, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::div", "calls": 6, "device_self_duration": 145, "device_total_duration": 145, "host_self_duration": 343, "host_total_duration": 469, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::_log_softmax_backward_data", "calls": 6, "device_self_duration": 51, "device_total_duration": 51, "host_self_duration": 142, "host_total_duration": 424, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::sum", "calls": 6, "device_self_duration": 45, "device_total_duration": 45, "host_self_duration": 201, "host_total_duration": 411, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::_log_softmax", "calls": 6, "device_self_duration": 42, "device_total_duration": 42, "host_self_duration": 168, "host_total_duration": 420, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_forward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 264, "host_total_duration": 343, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_backward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 463, "host_total_duration": 747, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::empty", "calls": 4212, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 82024, "host_total_duration": 82024, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::zero_", "calls": 996, "device_self_duration": 0, "device_total_duration": 1882, "host_self_duration": 10815, "host_total_duration": 28309, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::zeros", "calls": 24, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 877, "host_total_duration": 1319, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::to", "calls": 36, "device_self_duration": 0, "device_total_duration": 18959, "host_self_duration": 547, "host_total_duration": 57776, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "detach_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, 
"host_self_duration": 156, "host_total_duration": 156, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::detach_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 167, "host_total_duration": 323, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::set_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 221, "host_total_duration": 221, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::empty_strided", "calls": 18, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 453, "host_total_duration": 453, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::resize_", "calls": 1896, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 15019, "host_total_duration": 15019, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::_convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 9919, "host_total_duration": 74086, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 7695, "host_total_duration": 81781, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::conv2d", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 9219, "host_total_duration": 91000, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::empty_like", "calls": 336, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 8385, "host_total_duration": 18591, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::view", "calls": 654, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 9046, "host_total_duration": 9046, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::_batch_norm_impl_index", "calls": 318, "device_self_duration": 0, "device_total_duration": 34315, "host_self_duration": 8102, "host_total_duration": 95925, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::batch_norm", "calls": 318, "device_self_duration": 0, "device_total_duration": 34315, "host_self_duration": 9664, "host_total_duration": 105589, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::clamp_min_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17862, "host_self_duration": 6556, "host_total_duration": 19059, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::relu_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17862, "host_self_duration": 10373, "host_total_duration": 29432, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 1422, "host_self_duration": 177, "host_total_duration": 599, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, 
"has_call_stack": true}, {"name": "aten::adaptive_avg_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 210, "host_self_duration": 127, "host_total_duration": 581, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::flatten", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 88, "host_total_duration": 163, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::as_strided", "calls": 42, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 244, "host_total_duration": 244, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::transpose", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 254, "host_total_duration": 413, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::t", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 312, "host_total_duration": 725, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::expand", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 152, "host_total_duration": 237, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::linear", "calls": 6, "device_self_duration": 0, "device_total_duration": 197, "host_self_duration": 139, "host_total_duration": 1463, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::log_softmax", "calls": 6, "device_self_duration": 0, "device_total_duration": 42, "host_self_duration": 106, "host_total_duration": 580, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 75, "host_total_duration": 418, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_nd", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 89, "host_total_duration": 507, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::cross_entropy_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 60, "host_self_duration": 138, "host_total_duration": 1225, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::ones_like", "calls": 6, "device_self_duration": 0, "device_total_duration": 6, "host_self_duration": 96, "host_total_duration": 470, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "NllLossBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 142, "host_total_duration": 889, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "LogSoftmaxBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 51, "host_self_duration": 76, "host_total_duration": 500, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::conj", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 15, "host_total_duration": 15, "tc_eligible": "No", 
"tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "AddmmBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 274, "host_self_duration": 244, "host_total_duration": 2026, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "torch::autograd::AccumulateGrad", "calls": 966, "device_self_duration": 0, "device_total_duration": 3012, "host_self_duration": 7181, "host_total_duration": 37189, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "TBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 33, "host_total_duration": 151, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "aten::reshape", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 43, "host_total_duration": 88, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "ViewBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 44, "host_total_duration": 132, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "MeanBackward1", "calls": 6, "device_self_duration": 0, "device_total_duration": 145, "host_self_duration": 95, "host_total_duration": 663, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "ReluBackward1", "calls": 294, "device_self_duration": 0, "device_total_duration": 27280, "host_self_duration": 2882, "host_total_duration": 14944, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "AddBackward0", "calls": 96, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 200, "host_total_duration": 200, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "CudnnBatchNormBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 61899, "host_self_duration": 5184, "host_total_duration": 41825, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward", "calls": 318, "device_self_duration": 0, "device_total_duration": 274794, "host_self_duration": 7726, "host_total_duration": 115797, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "CudnnConvolutionBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 274794, "host_self_duration": 4093, "host_total_duration": 119890, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "MaxPool2DWithIndicesBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 4695, "host_self_duration": 97, "host_total_duration": 590, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}]} +{"metadata": {"sort": "Total Duration (us)"}, "data": {"columns": [{"type": "string", "name": "Name"}, {"type": "string", "name": "Tensor Cores Used", "tooltip": "Whether this kernel uses Tensor Cores."}, {"type": "number", "name": "Calls"}, {"type": "number", "name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM = 
blocks of this kernel / SM number of this GPU.\nIf this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n\"Mean Blocks per SM\" is the weighted average of all calls of this kernel, using each call's execution duration as weight."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nFor most cases such as memory bandwidth bounded kernels, the higher the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, using each call's execution duration as weight. It shows fine-grained low-level GPU utilization."}], "rows": [["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", "No", 180, 86855, 483, 1023, 323, 45.33, 30.04], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", "No", 264, 59568, 226, 923, 45, 4.33, 67.92], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 90, 43471, 483, 742, 363, 8.18, 38.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", "No", 3090, 39753, 13, 376, 1, 641.51, 92.35], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 90, 37016, 411, 735, 346, 12.39, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 72, 35106, 488, 822, 350, 3.83, 41.64], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", "No", 294, 27280, 93, 377, 13, 653.26, 100.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", "No", 150, 27084, 181, 454, 53, 3.12, 64.02], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", "No", 72, 25342, 352, 629, 323, 3.21, 25.0], ["volta_sgemm_64x64_nt", "No", 102, 21125, 207, 281, 184, 10.28, 19.38], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", "No", 48, 20473, 427, 681, 309, 6.82, 25.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() 
const::{lambda(float)#1}, at::detail::Array >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", "No", 294, 17862, 61, 252, 5, 666.77, 100.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", "No", 36, 12761, 354, 365, 344, 22.4, 25.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", "No", 30, 9559, 319, 508, 255, 12.91, 19.0], ["volta_sgemm_128x32_nt", "No", 24, 8658, 361, 479, 18, 0.97, 11.51], ["volta_sgemm_64x64_nn", "No", 42, 8544, 203, 210, 197, 12.35, 24.14], ["volta_scudnn_128x64_relu_interior_nn_v1", "No", 30, 7976, 266, 316, 92, 37.08, 25.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 12, 7939, 662, 733, 584, 7.54, 25.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", "No", 12, 7819, 652, 670, 634, 15.96, 19.0], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", "No", 168, 7231, 43, 89, 11, 12.63, 75.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 12, 7068, 589, 990, 192, 85.38, 37.51], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", "No", 180, 5901, 33, 142, 5, 525.02, 100.0], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", "No", 120, 5314, 44, 72, 20, 10.02, 50.0], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 12, 5221, 435, 440, 431, 9.8, 31.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", "No", 78, 4681, 60, 126, 20, 15.46, 38.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", "No", 120, 4648, 39, 67, 17, 10.15, 50.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", "No", 78, 4559, 58, 126, 17, 15.71, 50.0], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 6, 4038, 673, 691, 649, 6.4, 25.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", "No", 6, 3838, 640, 643, 637, 1254.4, 100.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nhwc_tn_v1", "No", 6, 3697, 616, 621, 614, 2.6, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", "No", 6, 3647, 608, 620, 602, 39.2, 25.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", "No", 12, 3550, 296, 309, 286, 19.6, 25.0], ["volta_scudnn_128x64_relu_small_nn_v1", "No", 12, 3273, 273, 286, 258, 9.8, 25.0], ["volta_sgemm_32x128_nn", "No", 18, 3059, 170, 173, 167, 22.05, 50.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", "No", 6, 
3034, 506, 520, 491, 19.6, 25.0], ["volta_sgemm_32x128_nt", "No", 18, 2837, 158, 159, 156, 22.05, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", "No", 120, 2632, 22, 67, 4, 8.75, 73.78], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", "No", 966, 2395, 2, 25, 1, 44.01, 58.56], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", "No", 54, 2331, 43, 75, 19, 20.83, 75.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", "No", 978, 1888, 2, 143, 0, 600.2, 86.95], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", "No", 78, 1484, 19, 69, 3, 8.13, 41.71], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", "No", 6, 1422, 237, 243, 234, 313.6, 100.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", "No", 6, 582, 97, 99, 94, 9.8, 19.0], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", "No", 12, 383, 32, 34, 29, 71.72, 100.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", "No", 318, 325, 1, 2, 1, 0.01, 0.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "No", 108, 216, 2, 5, 1, 0.16, 2.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "No", 6, 210, 35, 35, 35, 51.2, 100.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", "No", 132, 155, 1, 2, 1, 0.16, 1.83], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", "No", 132, 150, 1, 2, 1, 0.02, 0.0], ["volta_sgemm_64x32_sliced1x4_nn", "No", 6, 149, 25, 25, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", "No", 6, 148, 25, 25, 24, 1.0, 13.0], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", "No", 6, 145, 24, 25, 24, 156.8, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", "No", 36, 126, 4, 5, 2, 0.4, 3.0], ["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", "No", 54, 57, 1, 2, 1, 0.02, 0.0], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", "No", 6, 54, 9, 10, 8, 12.8, 100.0], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", "No", 6, 51, 8, 9, 8, 0.1, 1.0], ["void at::native::reduce_kernel<128, 4, 
at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "No", 6, 45, 8, 8, 7, 0.03, 0.0], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", "No", 6, 42, 7, 7, 7, 0.1, 1.0], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", "No", 12, 31, 3, 4, 2, 4.39, 27.74], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", "No", 6, 30, 5, 5, 5, 1.56, 5.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", "No", 6, 18, 3, 3, 3, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", "No", 6, 12, 2, 2, 2, 0.01, 0.0]]}} +{"total": {"columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["void cudnn::detail::dgrad_engine(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 86855], ["void cudnn::bn_bw_1C11_kernel_new(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 59568], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 43471], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor, at::detail::Array >(int, at::native::AddFunctor, at::detail::Array)", 39753], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 37016], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 35106], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array >(int, at::native::threshold_kernel_impl(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array)", 27280], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 27084], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 25342], ["volta_sgemm_64x64_nt", 21125], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 20473], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array >(int, 
at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array)", 17862], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 12761], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 9559], ["volta_sgemm_128x32_nt", 8658], ["volta_sgemm_64x64_nn", 8544], ["volta_scudnn_128x64_relu_interior_nn_v1", 7976], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 7939], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 7819], ["void cudnn::bn_fw_tr_1C11_singleread(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 7231], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 7068], ["void cudnn::ops::scalePackedTensor_kernel(cudnnTensor4dStruct, float*, float)", 5901], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4(cudnn::winograd_nonfused::WinogradOutputParams)", 5314], ["void implicit_convolve_sgemm(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 5221], ["void cudnn::winograd_nonfused::winogradWgradData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 4681], ["void cudnn::winograd_nonfused::winogradForwardData4x4(cudnn::winograd_nonfused::WinogradDataParams)", 4648], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4(cudnn::winograd_nonfused::WinogradDeltaParams)", 4559], ["void cudnn::cnn::wgrad_alg0_engine(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 4038], ["void at::native::(anonymous namespace)::max_pool_backward_nchw(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 3838], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nhwc_tn_v1", 3697], ["volta_scudnn_128x64_relu_medium_nn_v1", 3647], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 3550], ["volta_scudnn_128x64_relu_small_nn_v1", 3273], ["volta_sgemm_32x128_nn", 3059], ["volta_scudnn_128x128_stridedB_small_nn_v1", 3034], ["volta_sgemm_32x128_nt", 2837], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4(cudnn::winograd_nonfused::WinogradFilterParams)", 2632], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor, at::detail::Array >(int, at::native::MulScalarFunctor, at::detail::Array)", 2395], ["void cudnn::bn_bw_1C11_singleread(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 2331], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at::native::FillFunctor, at::detail::Array)", 1888], ["void 
cudnn::winograd_nonfused::winogradWgradOutput4x4(cudnn::winograd_nonfused::WinogradWgradOutputParams)", 1484], ["void at::native::(anonymous namespace)::max_pool_forward_nchw(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 1422], ["volta_scudnn_128x64_stridedB_small_nn_v1", 582], ["void nchwToNhwcKernel(int, int, int, int, float const*, float*, float, float)", 383], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, at::detail::Array >(int, at::native::BUnaryFunctor >, at::detail::Array)", 325], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 216], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", 210], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 155], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 150], ["volta_sgemm_64x32_sliced1x4_nn", 149], ["volta_sgemm_64x32_sliced1x4_tn", 148], ["void at::native::unrolled_elementwise_kernel, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor, at::detail::Array, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 145], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams)", 126], ["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 57], ["void nhwcToNchwKernel(int, int, int, int, float const*, float*, float, float)", 54], ["void (anonymous namespace)::softmax_warp_backward(float*, float const*, float const*, int, int, int)", 51], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 45], ["void (anonymous namespace)::softmax_warp_forward(float*, float const*, int, int, int)", 42], ["void splitKreduce_kernel(cublasSplitKParams, float const*, float const*, float*, float const*, float const*, float const*)", 31], ["void at::native::unrolled_elementwise_kernel, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array::StoreWithoutCast)", 30], ["void cunn_ClassNLLCriterion_updateOutput_kernel(float*, float*, float*, long*, float*, int, int, int, int, long)", 18], ["void cunn_ClassNLLCriterion_updateGradInput_kernel(float*, float*, long*, float*, float*, int, int, int, int, long)", 12]]}}
diff --git a/tb_plugins/profiling/tb_plugin/test/test_compare_with_autograd.py b/tb_plugins/profiling/tb_plugin/test/test_compare_with_autograd.py
new file mode 100644
index 0000000000000000000000000000000000000000..d097fbd3ccc52e9d750b8c8f618198393b5ebc7b
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/test/test_compare_with_autograd.py
@@ -0,0 +1,301 @@
+import os
+import time
+import unittest
+import pytest
+import torch
+import torch.nn as nn
+import torch.backends.cudnn as cudnn
+import torch.optim
+import torch.utils.data
+import torchvision
+import torchvision.transforms as T
+import torchvision.models as models
+import torch_tb_profiler.io as io
+from torch_tb_profiler.profiler import RunLoader
+
+
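+# The helpers below build dictionaries keyed by '<worker>#<table>' so the test can
+# assert that two independent views agree: get_autograd_result() aggregates events
+# directly from torch.autograd's key_averages(), while get_plugin_result() reads the
+# same aggregation back from the run parsed by the plugin.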
+def create_log_dir():
+    log_dir_name = './log{}'.format(str(int(time.time()*1000)))
+    try:
+        os.makedirs(log_dir_name)
+    except Exception:
+        raise RuntimeError("Can't create directory: " + log_dir_name)
+    return log_dir_name
+
+
+def get_autograd_result(p, worker_name, record_shapes=False, with_stack=False):
+    avgs = p.key_averages()
+    sort_by = 'self_cuda_time_total'
+    avgs = sorted(
+        avgs, key=lambda evt: getattr(evt, sort_by), reverse=True
+    )
+    is_gpu = False
+    if avgs[0].self_cuda_time_total > 0:
+        is_gpu = True
+    others_prefix = {'enumerate(DataLoader)#', 'Optimizer.zero_grad#', 'Optimizer.step#',
+                     'ProfilerStep*',
+                     'Memcpy', 'Memset',
+                     'cuda'}
+    postfix_to_type = {'CPU': 'operator', 'CUDA': 'kernel'}
+
+    def get_type(evt):
+        s = str(evt.device_type)
+        postfix = s[s.index('.') + 1:]
+        evt_type = postfix_to_type[postfix]
+        for prefix in others_prefix:
+            if evt.key.startswith(prefix):
+                evt_type = 'Other'
+                break
+        return evt_type
+
+    result_dict = dict()
+    result_dict[worker_name + '#operator'] = list()
+    if is_gpu:
+        result_dict[worker_name + '#kernel'] = list()
+    for avg in avgs:
+        evt_type = get_type(avg)
+        if evt_type == 'operator':
+            line = [avg.key, int(avg.count)]
+            if is_gpu:
+                line.extend([int(avg.self_cuda_time_total), int(avg.cuda_time_total)])
+            line.extend([int(avg.self_cpu_time_total), int(avg.cpu_time_total)])
+            result_dict[worker_name + '#operator'].append(line)
+        elif is_gpu and evt_type == 'kernel':
+            line = [avg.key, int(avg.count), int(avg.self_cuda_time_total)]
+            result_dict[worker_name + '#kernel'].append(line)
+    if record_shapes:
+        result_dict[worker_name + '#operator#input_shape'] = list()
+        avgs = p.key_averages(True)
+        sort_by = 'self_cuda_time_total'
+        avgs = sorted(
+            avgs, key=lambda evt: getattr(evt, sort_by), reverse=True
+        )
+        for avg in avgs:
+            evt_type = get_type(avg)
+            if evt_type == 'operator':
+                line = [avg.key, str(avg.input_shapes) if avg.input_shapes else '[]', int(avg.count)]
+                if is_gpu:
+                    line.extend([int(avg.self_cuda_time_total), int(avg.cuda_time_total)])
+                line.extend([int(avg.self_cpu_time_total), int(avg.cpu_time_total)])
+                result_dict[worker_name + '#operator#input_shape'].append(line)
+    # The call stacks of the legacy and kineto profilers differ for now:
+    # the legacy profiler records a stack for backward operators while kineto does not,
+    # so the call stack comparison is disabled for the moment.
+    if False and with_stack:
+        result_dict[worker_name + '#operator#stack'] = list()
+        avgs = p.key_averages(False, 100)
+        sort_by = 'self_cuda_time_total'
+        avgs = sorted(
+            avgs, key=lambda evt: getattr(evt, sort_by), reverse=True
+        )
+        for avg in avgs:
+            evt_type = get_type(avg)
+            if evt_type == 'operator' and avg.stack:
+                line = [avg.key, int(avg.count)]
+                if is_gpu:
+                    line.extend([int(avg.self_cuda_time_total), int(avg.cuda_time_total)])
+                line.extend([int(avg.self_cpu_time_total), int(avg.cpu_time_total), ''.join(avg.stack)])
+                result_dict[worker_name + '#operator#stack'].append(line)
+
+        result_dict[worker_name + '#operator#stack#input_shape'] = list()
+        avgs = p.key_averages(True, 100)
+        sort_by = 'self_cuda_time_total'
+        avgs = sorted(
+            avgs, key=lambda evt: getattr(evt, sort_by), reverse=True
+        )
+        for avg in avgs:
+            evt_type = get_type(avg)
+            if evt_type == 'operator' and avg.stack:
+                line = [avg.key, str(avg.input_shapes), int(avg.count)]
+                if is_gpu:
+                    line.extend([int(avg.self_cuda_time_total), int(avg.cuda_time_total)])
+                line.extend([int(avg.self_cpu_time_total), int(avg.cpu_time_total), ''.join(avg.stack)])
+                result_dict[worker_name + '#operator#stack#input_shape'].append(line)
+
+    return result_dict
+
+
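+# Rows generated here mirror the line layout built in get_autograd_result():
+# name, [input_shape,] calls, [device self/total durations,] host self/total
+# durations[, call stack], so both sides can be compared by plain list equality.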
+def generate_plugin_result_row(data):
+    row = list()
+    row.append(data['name'])
+    if 'input_shape' in data:
+        row.append(data['input_shape'])
+    row.append(data['calls'])
+    if 'device_self_duration' in data:
+        row.append(data['device_self_duration'])
+        row.append(data['device_total_duration'])
+    row.extend([data['host_self_duration'], data['host_total_duration']])
+    if 'call_stack' in data:
+        row.append(data['call_stack'])
+    return row
+
+
+def get_plugin_result(run, record_shapes=False, with_stack=False):
+    result_dict = dict()
+    for (worker_name, span), profile in run.profiles.items():
+        worker_name = worker_name.split('.')[0]
+        assert profile.operation_table_by_name is not None
+        result_dict[worker_name + '#operator'] = list()
+        for data in profile.operation_table_by_name['data']:
+            row = generate_plugin_result_row(data)
+            result_dict[worker_name + '#operator'].append(row)
+        if profile.kernel_table is not None:
+            rows = profile.kernel_table['data']['rows']
+            result_dict[worker_name + '#kernel'] = list()
+            for row in rows:
+                result_dict[worker_name + '#kernel'].append([row[0], row[2], row[3]])  # row[1] is 'Tensor Cores Used'.
+        if record_shapes:
+            assert profile.operation_table_by_name_input is not None
+            result_dict[worker_name + '#operator#input_shape'] = list()
+            for data in profile.operation_table_by_name_input['data']:
+                row = generate_plugin_result_row(data)
+                result_dict[worker_name + '#operator#input_shape'].append(row)
+        # The call stacks of the legacy and kineto profilers differ for now:
+        # the legacy profiler records a stack for backward operators while kineto does not,
+        # so the call stack comparison is disabled for the moment.
+        if False and with_stack:
+            assert profile.operation_stack_by_name is not None
+            assert profile.operation_stack_by_name_input is not None
+            result_dict[worker_name + '#operator#stack'] = list()
+            op_stack_dict = profile.operation_stack_by_name
+            for k, datalist in op_stack_dict.items():
+                for data in datalist:
+                    row = generate_plugin_result_row(data)
+                    result_dict[worker_name + '#operator#stack'].append(row)
+            if record_shapes:
+                result_dict[worker_name + '#operator#stack#input_shape'] = list()
+                op_stack_dict = profile.operation_stack_by_name_input
+                for k, datalist in op_stack_dict.items():
+                    for data in datalist:
+                        row = generate_plugin_result_row(data)
+                        result_dict[worker_name + '#operator#stack#input_shape'].append(row)
+
+    return result_dict
+
+
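+# A small ResNet-50/CIFAR-10 training closure used as the workload under profiling;
+# train_step bounds the number of iterations and prof.step() advances the profiler
+# schedule whenever a profiler instance is passed in.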
+
+            outputs = model(inputs)
+            loss = criterion(outputs, labels)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            if prof is not None:
+                prof.step()
+            if step >= train_step:
+                break
+    return train
+
+
+def get_output_fn(dir_name, profilers_dict):
+    def output_fn(p):
+        # In the current torch.profiler.profile, a new p.profiler is created at the beginning of each span,
+        # so the same p.profiler is not shared among different spans.
+        worker_name = 'worker{}'.format(p.step_num)
+        profilers_dict[worker_name] = p.profiler
+        tb_trace_handler = torch.profiler.tensorboard_trace_handler(dir_name, worker_name)
+        tb_trace_handler(p)
+    return output_fn
+
+
+class TestCompareWithAutogradResult(unittest.TestCase):
+
+    def compare_results(self, log_dir, profilers_dict, use_gpu=True, record_shapes=False, with_stack=False):
+        cache = io.Cache()
+        loader = RunLoader(os.path.split(log_dir)[-1], log_dir, cache)
+        run = loader.load()
+        plugin_result = get_plugin_result(run, record_shapes, with_stack)
+        count = 0
+        for worker_name, p in profilers_dict.items():
+            autograd_result = get_autograd_result(p, worker_name, record_shapes, with_stack)
+            for key in autograd_result.keys():
+                count += 1
+                self.assertTrue(key in plugin_result.keys())
+                self.assertEqual(len(plugin_result[key]), len(autograd_result[key]))
+                for line in plugin_result[key]:
+                    self.assertTrue(line in autograd_result[key])
+        self.assertEqual(count, len(plugin_result.keys()))
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason='CUDA is not available')
+    def test_autograd_api(self):
+        with torch.autograd.profiler.profile(use_cuda=True, use_kineto=True, record_shapes=True) as p:
+            get_train_func()(5)
+        log_dir = create_log_dir()
+        p.export_chrome_trace(os.path.join(log_dir, 'worker0.{}.pt.trace.json'.format(int(time.time() * 1000))))
+        self.compare_results(log_dir, {'worker0': p})
+
+    def base_profiler_api(self, use_gpu, record_shapes, profile_memory, with_stack):
+        log_dir = create_log_dir()
+        profilers_dict = dict()
+        if use_gpu:
+            activities = [
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA]
+        else:
+            activities = [torch.profiler.ProfilerActivity.CPU]
+
+        with torch.profiler.profile(
+            activities=activities,
+            schedule=torch.profiler.schedule(
+                wait=2,
+                warmup=2,
+                active=3),
+            on_trace_ready=get_output_fn(log_dir, profilers_dict),
+            record_shapes=record_shapes,
+            profile_memory=profile_memory,
+            with_stack=with_stack
+        ) as p:
+            get_train_func(use_gpu)(13, p)
+        self.compare_results(log_dir, profilers_dict, use_gpu, record_shapes, with_stack)
+
+    def test_profiler_api_without_gpu(self):
+        self.base_profiler_api(False, True, True, False)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason='CUDA is not available')
+    def test_profiler_api_with_record_shapes_memory_stack(self):
+        self.base_profiler_api(True, True, True, True)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason='CUDA is not available')
+    def test_profiler_api_without_record_shapes_memory_stack(self):
+        self.base_profiler_api(True, False, False, False)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason='CUDA is not available')
+    def test_profiler_api_without_step(self):
+        log_dir = create_log_dir()
+        profilers_dict = dict()
+        with torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA],
+            on_trace_ready=get_output_fn(log_dir, profilers_dict),
+            record_shapes=True
+        ):
+            get_train_func()(7)
+        self.compare_results(log_dir, profilers_dict)
diff --git a/tb_plugins/profiling/tb_plugin/test/test_diffrun.py b/tb_plugins/profiling/tb_plugin/test/test_diffrun.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1bebbfbfa4f2d4b983b2f439b81548a07b51618
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/test/test_diffrun.py
@@ -0,0 +1,49 @@
+import os
+import unittest
+
+import pytest
+from torch_tb_profiler.profiler.data import RunProfileData
+from torch_tb_profiler.profiler.diffrun import (compare_op_tree, diff_summary,
+                                                print_node, print_ops)
+from torch_tb_profiler.utils import timing
+
+
+def load_profile(worker, span, path):
+    return RunProfileData.parse(worker, span, path, '.')
+
+
+class TestDiffRun(unittest.TestCase):
+
+    @pytest.mark.skipif(not (os.path.isfile(os.path.expanduser('~/profile_result/worker0.pt.trace.json')) and
+                             os.path.isfile(os.path.expanduser('~/profile_result/worker1.pt.trace.json'))),
+                        reason="file doesn't exist")
+    def test_happy_path(self):
+        path1 = os.path.expanduser('~/profile_result/worker0.pt.trace.json')
+        profile1 = load_profile('worker0', 1, path1)
+        roots = list(profile1.tid2tree.values())
+        root = roots[0]
+
+        path2 = os.path.expanduser('~/profile_result/worker1.pt.trace.json')
+        profile2 = load_profile('worker0', 1, path2)
+        roots1 = list(profile2.tid2tree.values())
+        root1 = roots1[0]
+
+        with timing('Compare operator tree', True):
+            node = compare_op_tree(root, root1)
+
+        print_ops(node.children[4].left, prefix='    ')
+        print('========================================================')
+        print_ops(node.children[4].right)
+
+        print('*********************** summary *************************')
+        with timing('Diff summary', True):
+            stats = diff_summary(node)
+
+        # result = stats.flatten_diff_tree()
+        # path = '0-1-1'
+        # json_data = result[path].get_diff_node_summary(path)
+        print_node(stats, 0, 0)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tb_plugins/profiling/tb_plugin/test/test_profiler.py b/tb_plugins/profiling/tb_plugin/test/test_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..fda2208aab5f786af3fa1dbc08efdd43653073a2
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/test/test_profiler.py
@@ -0,0 +1,2752 @@
+import gzip
+import json
+import os
+import unittest
+
+from torch_tb_profiler.profiler.data import (DistributedRunProfileData,
+                                             RunProfileData)
+from torch_tb_profiler.profiler.loader import RunLoader
+from torch_tb_profiler.profiler.overall_parser import ProfileRole
+from torch_tb_profiler.profiler.gpu_metrics_parser import GPUMetricsParser
+from torch_tb_profiler.run import RunProfile
+
+SCHEMA_VERSION = 1
+WORKER_NAME = 'worker0'
+
+
+def parse_json_trace(json_content, worker_name=WORKER_NAME) -> RunProfileData:
+    trace_json = json.loads(json_content)
+    trace_json = {'schemaVersion': 1, 'traceEvents': trace_json}
+    return RunProfileData.from_json(worker_name, 0, trace_json)
+
+
+'''
+All the events in the JSON strings below are simulated, not actually generated events.
+We removed the data fields that are not used by the current version of the profiler,
+to make correctness easier to check and the strings shorter.
+We even renamed data values such as kernel names or 'ts' values to simplify the strings.
+'''
+
+
+class TestProfiler(unittest.TestCase):
+    # A test case including all 7 event categories.
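+    # The seven categories asserted below map one-to-one onto the ProfileRole buckets:
+    # Kernel, Memcpy, Memset, Runtime, DataLoader, CpuOp and Other.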
+ def test_all_categories(self): + json_content = """ + [{ + "ph": "X", "cat": "Operator", + "name": "enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__", "pid": 13721, "tid": "123", + "ts": 100, "dur": 180, + "args": {"Input Dims": [], "External id": 2} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 13721, "tid": "123", + "ts": 200, "dur": 60, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::nll_loss_backward", "pid": 13721, "tid": "456", + "ts": 340, "dur": 70, + "args": {"Input Dims": [[], [32, 1000], [32], [], [], [], []], "External id": 4} + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#1", "pid": 13721, "tid": "123", + "ts": 50, "dur": 400, + "args": {"Input Dims": [], "External id": 1} + }, + { + "ph": "X", "cat": "Memcpy", + "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": "stream 7", + "ts": 405, "dur": 10, + "args": {"stream": 7, "correlation": 334, "external id": 4} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemcpyAsync", "pid": 13721, "tid": "456", + "ts": 360, "dur": 20, + "args": {"correlation": 334, "external id": 4} + }, + { + "ph": "X", "cat": "Memset", + "name": "Memset (Device)", "pid": 0, "tid": "stream 7", + "ts": 420, "dur": 5, + "args": {"stream": 7, "correlation": 40344, "external id": 4} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemsetAsync", "pid": 13721, "tid": "456", + "ts": 390, "dur": 10, + "args": {"correlation": 40344, "external id": 4} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7", + "ts": 430, "dur": 15, + "args": {"correlation": 40348, "external id": 4, "device": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 405, "dur": 5, + "args": {"correlation": 40348, "external id": 4} + }] + """ + profile = parse_json_trace(json_content) + profile.process() + + self.assertTrue(profile.has_runtime) + self.assertTrue(profile.has_kernel) + self.assertTrue(profile.has_memcpy_or_memset) + step = profile.steps_costs[0] + self.assertEqual(step.costs[ProfileRole.Kernel], 15) + self.assertEqual(step.costs[ProfileRole.Memcpy], 10) + self.assertEqual(step.costs[ProfileRole.Memset], 5) + self.assertEqual(step.costs[ProfileRole.Runtime], 30) + self.assertEqual(step.costs[ProfileRole.DataLoader], 180) + self.assertEqual(step.costs[ProfileRole.CpuOp], 35) + self.assertEqual(step.costs[ProfileRole.Other], 125) + + self.assertEqual(len(profile.op_list_groupby_name), 2) + self.assertEqual(len(profile.op_list_groupby_name_input), 2) + + def test_op_list(op_list): + op_count = 0 + for op_agg in op_list: + if op_agg.name == 'aten::to': + op_count += 1 + self.assertEqual(op_agg.input_shape, + '[[2, 8, 5], [], [], [], [], [], [], []]') + self.assertEqual(op_agg.calls, 1) + self.assertEqual(op_agg.host_duration, 60) + self.assertEqual(op_agg.device_duration, 0) + self.assertEqual(op_agg.self_host_duration, 60) + self.assertEqual(op_agg.self_device_duration, 0) + if op_agg.name == 'aten::nll_loss_backward': + op_count += 1 + self.assertEqual(op_agg.input_shape, + '[[], [32, 1000], [32], [], [], [], []]') + self.assertEqual(op_agg.calls, 1) + self.assertEqual(op_agg.host_duration, 70) + self.assertEqual(op_agg.device_duration, 30) + self.assertEqual( + op_agg.self_host_duration, 70 - 20 - 10 - 5) + self.assertEqual(op_agg.self_device_duration, 30) + 
self.assertEqual(op_count, 2)
+
+        test_op_list(profile.op_list_groupby_name)
+        test_op_list(profile.op_list_groupby_name_input)
+
+        self.assertEqual(len(profile.kernel_list_groupby_name_op), 1)
+        self.assertEqual(profile.kernel_stat.shape[0], 1)
+        self.assertEqual(profile.kernel_list_groupby_name_op[0].name,
+                         'void cunn_ClassNLLCriterion_updateGradInput_kernel')
+        self.assertEqual(
+            profile.kernel_list_groupby_name_op[0].op_name, 'aten::nll_loss_backward')
+        self.assertEqual(profile.kernel_list_groupby_name_op[0].calls, 1)
+        self.assertEqual(
+            profile.kernel_list_groupby_name_op[0].total_duration, 15)
+        self.assertEqual(
+            profile.kernel_list_groupby_name_op[0].min_duration, 15)
+        self.assertEqual(
+            profile.kernel_list_groupby_name_op[0].max_duration, 15)
+        self.assertEqual(profile.kernel_stat.iloc[0]['count'], 1)
+        self.assertEqual(profile.kernel_stat.iloc[0]['sum'], 15)
+        self.assertEqual(profile.kernel_stat.iloc[0]['mean'], 15)
+        self.assertEqual(profile.kernel_stat.iloc[0]['min'], 15)
+        self.assertEqual(profile.kernel_stat.iloc[0]['max'], 15)
+
+    # Test using external_id to build the relationship between Operator and Runtime events.
+    # external_id correlates a RuntimeNode with its parent OperatorNode or ProfilerStepNode,
+    # because when a RuntimeNode has duration 0 and starts at the same time as an OperatorNode,
+    # interval containment alone cannot tell whether it is a child or a sibling of that OperatorNode.
+    def test_external_id(self):
+        json_content = """
+          [{
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mat_mul", "pid": 13721, "tid": "456",
+            "ts": 100, "dur": 100,
+            "args": {"Input Dims": [], "External id": 2}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 120, "dur": 70,
+            "args": {"Input Dims": [], "External id": 4}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7",
+            "ts": 130, "dur": 5,
+            "args": {"correlation": 334, "external id": 4, "device": 0}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 120, "dur": 0,
+            "args": {"correlation": 334, "external id": 4}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7",
+            "ts": 130, "dur": 6,
+            "args": {"correlation": 335, "external id": 2, "device": 0}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 120, "dur": 0,
+            "args": {"correlation": 335, "external id": 2}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7",
+            "ts": 130, "dur": 7,
+            "args": {"correlation": 336, "external id": 4, "device": 0}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 190, "dur": 0,
+            "args": {"correlation": 336, "external id": 4}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7",
+            "ts": 130, "dur": 8,
+            "args": {"correlation": 337, "external id": 2, "device": 0}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 190, "dur": 0,
+            "args": {"correlation": 337, "external id": 2}
+          }]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+
+        op_count = 0
+        for op_agg in profile.op_list_groupby_name:
+            if op_agg.name == 'aten::mat_mul':
+                op_count += 1
+                self.assertEqual(op_agg.device_duration, 5 + 6 + 7 + 8)
+                self.assertEqual(op_agg.self_device_duration, 6 + 8)
+            if op_agg.name == 'aten::mm':
+                op_count += 1
+                self.assertEqual(op_agg.device_duration, 5 + 7)
+                self.assertEqual(op_agg.self_device_duration, 5 + 7)
+        self.assertEqual(op_count, 2)
+
+    # Test operators' parent-child relationships when they have the same start time or end time.
+    def test_operator_relation(self):
+        # 2 events with same start time.
+        json_content = """
+          [{
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mat_mul", "pid": 13721, "tid": "456",
+            "ts": 100, "dur": 100,
+            "args": {"Input Dims": [], "External id": 2}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 100, "dur": 70,
+            "args": {"Input Dims": [], "External id": 4}
+          }]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+        op_count = 0
+        for op_agg in profile.op_list_groupby_name:
+            if op_agg.name == 'aten::mat_mul':
+                op_count += 1
+                self.assertEqual(op_agg.self_host_duration, 100 - 70)
+            if op_agg.name == 'aten::mm':
+                op_count += 1
+                self.assertEqual(op_agg.self_host_duration, 70)
+        self.assertEqual(op_count, 2)
+
+        # 2 events with same end time.
+        json_content = """
+          [{
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mat_mul", "pid": 13721, "tid": "456",
+            "ts": 100, "dur": 100,
+            "args": {"Input Dims": [], "External id": 2}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 130, "dur": 70,
+            "args": {"Input Dims": [], "External id": 4}
+          }]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+        op_count = 0
+        for op_agg in profile.op_list_groupby_name:
+            if op_agg.name == 'aten::mat_mul':
+                op_count += 1
+                self.assertEqual(op_agg.self_host_duration, 100 - 70)
+            if op_agg.name == 'aten::mm':
+                op_count += 1
+                self.assertEqual(op_agg.self_host_duration, 70)
+        self.assertEqual(op_count, 2)
+
+    # Test nested operators with the same name.
+    # In this case, all the operators except the top one should be removed,
+    # and all runtimes/kernels belonging to the child operators should be attached to the only kept one.
+    # This behavior keeps consistent with _remove_dup_nodes in torch/autograd/profiler.py.
+    def test_remove_dup_nodes(self):
+        json_content = """[
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 100, "dur": 100,
+            "args": {"Input Dims": [], "External id": 2}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 110, "dur": 80,
+            "args": {"Input Dims": [], "External id": 3}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 120, "dur": 60,
+            "args": {"Input Dims": [], "External id": 4}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 130, "dur": 20,
+            "args": {"correlation": 335, "external id": 4}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void gemmSN_TN_kernel_64addr", "pid": 0, "tid": "stream 7",
+            "ts": 220, "dur": 8,
+            "args": {"correlation": 335, "external id": 4, "device": 0}
+          }
+        ]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+        self.assertEqual(len(profile.op_list_groupby_name), 1)
+        self.assertEqual(
+            profile.op_list_groupby_name[0].self_device_duration, 8)
+
+    # Test a Runtime with 'external id' 0.
+    # Such a Runtime should not be attached to any operator,
+    # but it should still be included when accumulating overall device time.
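+    # A rough sketch of the expectation encoded below: the operator keeps
+    # device_duration == self_device_duration == 0, while kernel_stat still
+    # counts the kernel once in the overall statistics.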
+    def test_top_level_runtime(self):
+        # This operator is on a different thread from the runtime.
+        json_content = """[
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "123",
+            "ts": 100, "dur": 100,
+            "args": {"Input Dims": [], "External id": 2}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 130, "dur": 20,
+            "args": {"correlation": 335, "external id": 0}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void gemmSN_TN_kernel_64addr", "pid": 0, "tid": "stream 7",
+            "ts": 220, "dur": 8,
+            "args": {"correlation": 335, "external id": 0, "device": 0}
+          }
+        ]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+        self.assertEqual(profile.op_list_groupby_name[0].device_duration, 0)
+        self.assertEqual(
+            profile.op_list_groupby_name[0].self_device_duration, 0)
+        self.assertEqual(profile.kernel_stat.iloc[0]['count'], 1)
+
+    # Test a Runtime called directly from a ProfilerStep, not inside any operator.
+    def test_runtime_called_by_profilerstep(self):
+        json_content = """[
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "ProfilerStep#1", "pid": 13721, "tid": "456",
+            "ts": 100, "dur": 300,
+            "args": {"Input Dims": [], "External id": 2}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 130, "dur": 20,
+            "args": {"correlation": 335, "external id": 2}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void gemmSN_TN_kernel_64addr", "pid": 0, "tid": "stream 7",
+            "ts": 220, "dur": 8,
+            "args": {"correlation": 335, "external id": 2, "device": 0}
+          }
+        ]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+        step = profile.steps_costs[0]
+        self.assertEqual(step.costs[ProfileRole.Kernel], 8)
+        self.assertEqual(step.costs[ProfileRole.Runtime], 20)
+        self.assertEqual(step.costs[ProfileRole.CpuOp], 0)
+        self.assertEqual(step.costs[ProfileRole.Other], 300 - 8 - 20)
+        # ProfilerStep is not regarded as an operator.
+        self.assertEqual(len(profile.op_list_groupby_name), 0)
+        self.assertEqual(len(profile.op_list_groupby_name_input), 0)
+        self.assertEqual(profile.kernel_stat.iloc[0]['count'], 1)
+        self.assertEqual(len(profile.kernel_list_groupby_name_op), 1)
+
+    # Test one Runtime launching more than one Kernel.
+    # Sometimes, such as when running BERT in DataParallel mode (1 process, 2 GPUs),
+    # one runtime call such as cudaLaunchCooperativeKernelMultiDevice can trigger more than one kernel,
+    # with each kernel running on a separate GPU card.
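+    # In that case the launching operator's device time should aggregate all the
+    # triggered kernels; below, that is 120318 + 132800, as the assertion checks.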
+    def test_runtime_launch_multiple_kernels(self):
+        json_content = """[
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "Broadcast", "pid": 13721, "tid": "456",
+            "ts": 100, "dur": 300,
+            "args": {"Input Dims": [], "External id": 2}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchCooperativeKernelMultiDevice", "pid": 13721, "tid": "456",
+            "ts": 130, "dur": 20,
+            "args": {"correlation": 335, "external id": 2}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "ncclBroadcastRingLLKernel_copy_i8(ncclColl)", "pid": 0, "tid": "stream 13",
+            "ts": 160, "dur": 120318,
+            "args": {"device": 0, "context": 1, "stream": 13,
+                     "correlation": 335, "external id": 2, "device": 0}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "ncclBroadcastRingLLKernel_copy_i8(ncclColl)", "pid": 0, "tid": "stream 22",
+            "ts": 170, "dur": 132800,
+            "args": {"device": 0, "context": 2, "stream": 22,
+                     "correlation": 335, "external id": 2}
+          }
+        ]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+        self.assertEqual(
+            profile.op_list_groupby_name[0].device_duration, 120318 + 132800)
+        self.assertEqual(profile.kernel_stat.iloc[0]['count'], 2)
+        self.assertEqual(len(profile.kernel_list_groupby_name_op), 1)
+
+    # Test when there is no ProfilerStep#.
+    def test_no_profilerstep(self):
+        json_content = """[
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::to", "pid": 13721, "tid": "123",
+            "ts": 100, "dur": 60,
+            "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::nll_loss_backward", "pid": 13721, "tid": "456",
+            "ts": 300, "dur": 70,
+            "args": {"Input Dims": [[], [32, 1000], [32], [], [], [], []], "External id": 4}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7",
+            "ts": 320, "dur": 100,
+            "args": {"correlation": 40348, "external id": 4, "device": 0}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 310, "dur": 20,
+            "args": {"correlation": 40348, "external id": 4}
+          }
+        ]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+
+        self.assertTrue(profile.has_runtime)
+        self.assertTrue(profile.has_kernel)
+        self.assertTrue(not profile.has_memcpy_or_memset)
+        self.assertEqual(len(profile.steps_costs), 1)
+        step = profile.steps_costs[0]
+
+        self.assertEqual(step.costs[ProfileRole.Kernel], 100)
+        self.assertEqual(step.costs[ProfileRole.Memcpy], 0)
+        self.assertEqual(step.costs[ProfileRole.Memset], 0)
+        self.assertEqual(step.costs[ProfileRole.Runtime], 320 - 310)
+        self.assertEqual(step.costs[ProfileRole.DataLoader], 0)
+        self.assertEqual(step.costs[ProfileRole.CpuOp], 60 + (310 - 300))
+        # If there is no ProfilerStep, all events are regarded as one step.
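+        # (The synthetic step then spans from the earliest host event start, ts=100,
+        # to the latest device-side end, 320 + 100 = 420, which the Total cost below reflects.)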
+ self.assertEqual(step.costs[ProfileRole.Other], 300 - (100 + 60)) + self.assertEqual(step.costs[ProfileRole.Total], (320 + 100) - 100) + self.assertEqual(len(profile.op_list_groupby_name), 2) + self.assertEqual(len(profile.op_list_groupby_name_input), 2) + self.assertEqual(profile.kernel_stat.iloc[0]['count'], 1) + self.assertEqual(len(profile.kernel_list_groupby_name_op), 1) + + def test_op_list(op_list): + op_count = 0 + for op_agg in op_list: + if op_agg.name == 'aten::to': + op_count += 1 + self.assertEqual(op_agg.input_shape, + '[[2, 8, 5], [], [], [], [], [], [], []]') + self.assertEqual(op_agg.calls, 1) + self.assertEqual(op_agg.host_duration, 60) + self.assertEqual(op_agg.device_duration, 0) + self.assertEqual(op_agg.self_host_duration, 60) + self.assertEqual(op_agg.self_device_duration, 0) + if op_agg.name == 'aten::nll_loss_backward': + op_count += 1 + self.assertEqual(op_agg.input_shape, + '[[], [32, 1000], [32], [], [], [], []]') + self.assertEqual(op_agg.calls, 1) + self.assertEqual(op_agg.host_duration, 70) + self.assertEqual(op_agg.device_duration, 100) + self.assertEqual(op_agg.self_host_duration, 70 - 20) + self.assertEqual(op_agg.self_device_duration, 100) + self.assertEqual(op_count, 2) + + test_op_list(profile.op_list_groupby_name) + test_op_list(profile.op_list_groupby_name_input) + + self.assertEqual(profile.kernel_list_groupby_name_op[0].name, + 'void cunn_ClassNLLCriterion_updateGradInput_kernel') + self.assertEqual( + profile.kernel_list_groupby_name_op[0].op_name, 'aten::nll_loss_backward') + self.assertEqual(profile.kernel_list_groupby_name_op[0].calls, 1) + self.assertEqual( + profile.kernel_list_groupby_name_op[0].total_duration, 100) + self.assertEqual( + profile.kernel_list_groupby_name_op[0].min_duration, 100) + self.assertEqual( + profile.kernel_list_groupby_name_op[0].max_duration, 100) + self.assertEqual(profile.kernel_stat.iloc[0]['count'], 1) + self.assertEqual(profile.kernel_stat.iloc[0]['sum'], 100) + self.assertEqual(profile.kernel_stat.iloc[0]['mean'], 100) + self.assertEqual(profile.kernel_stat.iloc[0]['min'], 100) + self.assertEqual(profile.kernel_stat.iloc[0]['max'], 100) + + # 2 steps without overlap with each other. 
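+    # Sketch of the step-time rule assumed by the assertions below: a step's total cost
+    # runs from its first host event until the latest device-side end it launched, e.g.
+    #     step 1: total = (280 + 40) - 100 = 220    # memcpy end minus step start
+    # which is why the test compares against 320 - 100 rather than the host-side dur of 200.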
+ def test_multiple_profilersteps_no_overlap(self): + json_content = """ + [{ + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#1", "pid": 13721, "tid": "123", + "ts": 100, "dur": 200, + "args": {"Input Dims": [], "External id": 1} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 13721, "tid": "123", + "ts": 200, "dur": 60, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 2} + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#2", "pid": 13721, "tid": "123", + "ts": 350, "dur": 150, + "args": {"Input Dims": [], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 13721, "tid": "123", + "ts": 360, "dur": 50, + "args": {"Input Dims": [], "External id": 4} + }, + { + "ph": "X", "cat": "Memcpy", + "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": "stream 7", + "ts": 280, "dur": 40, + "args": {"stream": 7, "correlation": 334, "external id": 2} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemcpyAsync", "pid": 13721, "tid": "123", + "ts": 250, "dur": 5, + "args": {"correlation": 334, "external id": 2} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7", + "ts": 410, "dur": 200, + "args": {"correlation": 40348, "external id": 4, "device": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "123", + "ts": 400, "dur": 5, + "args": {"correlation": 40348, "external id": 4} + }] + """ + profile = parse_json_trace(json_content) + profile.process() + + self.assertTrue(profile.has_runtime) + self.assertTrue(profile.has_kernel) + self.assertTrue(profile.has_memcpy_or_memset) + self.assertEqual(len(profile.steps_costs), 2) + step = profile.steps_costs[0] + self.assertEqual(step.costs[ProfileRole.Kernel], 0) + self.assertEqual(step.costs[ProfileRole.Memcpy], 40) + self.assertEqual(step.costs[ProfileRole.Memset], 0) + self.assertEqual(step.costs[ProfileRole.Runtime], 5) + self.assertEqual(step.costs[ProfileRole.DataLoader], 0) + self.assertEqual(step.costs[ProfileRole.CpuOp], 60 - 5) + self.assertEqual(step.costs[ProfileRole.Other], 200 - 60 - 20) + # Device side takes effect. + self.assertEqual(step.costs[ProfileRole.Total], 320 - 100) + step = profile.steps_costs[1] + self.assertEqual(step.costs[ProfileRole.Kernel], 200) + self.assertEqual(step.costs[ProfileRole.Memcpy], 0) + self.assertEqual(step.costs[ProfileRole.Memset], 0) + self.assertEqual(step.costs[ProfileRole.Runtime], 5) + self.assertEqual(step.costs[ProfileRole.DataLoader], 0) + self.assertEqual(step.costs[ProfileRole.CpuOp], 50 - 5) + self.assertEqual(step.costs[ProfileRole.Other], 360 - 350) + # Device side takes effect. 
+ self.assertEqual(step.costs[ProfileRole.Total], 610 - 350) + self.assertEqual( + profile.avg_costs.costs[ProfileRole.Total], ((320 - 100) + (610 - 350)) / 2) + + self.assertEqual(len(profile.op_list_groupby_name), 2) + self.assertEqual(len(profile.op_list_groupby_name_input), 2) + + def test_op_list(op_list): + op_count = 0 + for op_agg in op_list: + if op_agg.name == 'aten::to': + op_count += 1 + self.assertEqual(op_agg.input_shape, + '[[2, 8, 5], [], [], [], [], [], [], []]') + self.assertEqual(op_agg.calls, 1) + self.assertEqual(op_agg.host_duration, 60) + self.assertEqual(op_agg.device_duration, 40) + self.assertEqual(op_agg.self_host_duration, 60 - 5) + self.assertEqual(op_agg.self_device_duration, 40) + if op_agg.name == 'aten::mm': + op_count += 1 + self.assertEqual(op_agg.input_shape, '[]') + self.assertEqual(op_agg.calls, 1) + self.assertEqual(op_agg.host_duration, 50) + self.assertEqual(op_agg.device_duration, 200) + self.assertEqual(op_agg.self_host_duration, 50 - 5) + self.assertEqual(op_agg.self_device_duration, 200) + self.assertEqual(op_count, 2) + + test_op_list(profile.op_list_groupby_name) + test_op_list(profile.op_list_groupby_name_input) + + self.assertEqual(len(profile.kernel_list_groupby_name_op), 1) + self.assertEqual(profile.kernel_stat.shape[0], 1) + self.assertEqual(profile.kernel_list_groupby_name_op[0].name, + 'void cunn_ClassNLLCriterion_updateGradInput_kernel') + self.assertEqual( + profile.kernel_list_groupby_name_op[0].op_name, 'aten::mm') + self.assertEqual(profile.kernel_list_groupby_name_op[0].calls, 1) + self.assertEqual( + profile.kernel_list_groupby_name_op[0].total_duration, 200) + self.assertEqual( + profile.kernel_list_groupby_name_op[0].min_duration, 200) + self.assertEqual( + profile.kernel_list_groupby_name_op[0].max_duration, 200) + self.assertEqual(profile.kernel_stat.iloc[0]['count'], 1) + self.assertEqual(profile.kernel_stat.iloc[0]['sum'], 200) + self.assertEqual(profile.kernel_stat.iloc[0]['mean'], 200) + self.assertEqual(profile.kernel_stat.iloc[0]['min'], 200) + self.assertEqual(profile.kernel_stat.iloc[0]['max'], 200) + + # Test self time and total time on operator with nested operator. 
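+    # Worked example of the self-time rule these assertions encode:
+    #     self_host_duration(op) = host_duration - child op durations - own runtime durations
+    # e.g. for aten::mat_mul below: 100 - 40 (aten::mm) - 25 (cudaLaunchKernel) = 35.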
+    def test_self_time(self):
+        json_content = """
+          [{
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mat_mul", "pid": 13721, "tid": "456",
+            "ts": 100, "dur": 100,
+            "args": {"Input Dims": [], "External id": 2}
+          },
+          {
+            "ph": "X", "cat": "Operator",
+            "name": "aten::mm", "pid": 13721, "tid": "456",
+            "ts": 120, "dur": 40,
+            "args": {"Input Dims": [], "External id": 4}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7",
+            "ts": 155, "dur": 20,
+            "args": {"correlation": 334, "external id": 4, "device": 0}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 150, "dur": 10,
+            "args": {"correlation": 334, "external id": 4}
+          },
+          {
+            "ph": "X", "cat": "Kernel",
+            "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7",
+            "ts": 210, "dur": 16,
+            "args": {"correlation": 335, "external id": 2, "device": 0}
+          },
+          {
+            "ph": "X", "cat": "Runtime",
+            "name": "cudaLaunchKernel", "pid": 13721, "tid": "456",
+            "ts": 170, "dur": 25,
+            "args": {"correlation": 335, "external id": 2}
+          }]
+        """
+        profile = parse_json_trace(json_content)
+        profile.process()
+
+        op_count = 0
+        for op_agg in profile.op_list_groupby_name:
+            if op_agg.name == 'aten::mat_mul':
+                op_count += 1
+                self.assertEqual(op_agg.host_duration, 100)
+                self.assertEqual(op_agg.device_duration, 20 + 16)
+                self.assertEqual(op_agg.self_host_duration, 100 - 40 - 25)
+                self.assertEqual(op_agg.self_device_duration, 16)
+            if op_agg.name == 'aten::mm':
+                op_count += 1
+                self.assertEqual(op_agg.host_duration, 40)
+                self.assertEqual(op_agg.device_duration, 20)
+                self.assertEqual(op_agg.self_host_duration, 30)
+                self.assertEqual(op_agg.self_device_duration, 20)
+        self.assertEqual(op_count, 2)
+
+    # 2 steps overlapping with each other.
+    def test_multiple_profilersteps_with_overlap(self):
+        # The kernel with 'correlation' 123 is launched by the previous step;
+        # its end time is later than 'ProfilerStep#1''s start time,
+        # so its end is regarded as the beginning of 'ProfilerStep#1'.
+        # The memcpy with 'correlation' 334 is launched by 'ProfilerStep#1';
+        # its end time is later than 'ProfilerStep#2''s start time,
+        # so its end is regarded as the beginning of 'ProfilerStep#2'.
+ json_content = """ + [{ + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#1", "pid": 13721, "tid": "123", + "ts": 100, "dur": 200, + "args": {"Input Dims": [], "External id": 1} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 13721, "tid": "123", + "ts": 200, "dur": 60, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 2} + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#2", "pid": 13721, "tid": "123", + "ts": 350, "dur": 150, + "args": {"Input Dims": [], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 13721, "tid": "123", + "ts": 360, "dur": 50, + "args": {"Input Dims": [], "External id": 4} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7", + "ts": 150, "dur": 90, + "args": {"correlation": 123, "external id": 0, "device": 0} + }, + { + "ph": "X", "cat": "Memcpy", + "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": "stream 7", + "ts": 280, "dur": 100, + "args": {"stream": 7, "correlation": 334, "external id": 2} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemcpyAsync", "pid": 13721, "tid": "123", + "ts": 250, "dur": 5, + "args": {"correlation": 334, "external id": 2} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7", + "ts": 410, "dur": 200, + "args": {"correlation": 40348, "external id": 4, "device": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "123", + "ts": 400, "dur": 5, + "args": {"correlation": 40348, "external id": 4} + }] + """ + profile = parse_json_trace(json_content) + profile.process() + + self.assertTrue(profile.has_runtime) + self.assertTrue(profile.has_kernel) + self.assertTrue(profile.has_memcpy_or_memset) + self.assertEqual(len(profile.steps_costs), 2) + step = profile.steps_costs[0] + self.assertEqual(step.costs[ProfileRole.Kernel], 0) + self.assertEqual(step.costs[ProfileRole.Memcpy], 100) + self.assertEqual(step.costs[ProfileRole.Memset], 0) + self.assertEqual(step.costs[ProfileRole.Runtime], 5) + self.assertEqual(step.costs[ProfileRole.DataLoader], 0) + self.assertEqual(step.costs[ProfileRole.CpuOp], + (200 + 60) - (150 + 90) - 5) + self.assertEqual(step.costs[ProfileRole.Other], 280 - (200 + 60)) + # Device side takes effect. + self.assertEqual(step.costs[ProfileRole.Total], + (280 + 100) - (150 + 90)) + step = profile.steps_costs[1] + self.assertEqual(step.costs[ProfileRole.Kernel], 200) + self.assertEqual(step.costs[ProfileRole.Memcpy], 0) + self.assertEqual(step.costs[ProfileRole.Memset], 0) + self.assertEqual(step.costs[ProfileRole.Runtime], 5) + self.assertEqual(step.costs[ProfileRole.DataLoader], 0) + self.assertEqual(step.costs[ProfileRole.CpuOp], + (280 + 100) - 360 + (410 - 405)) + self.assertEqual(step.costs[ProfileRole.Other], 0) + # Device side takes effect. + self.assertEqual(step.costs[ProfileRole.Total], 610 - (280 + 100)) + + # Test whether step time is calculated correctly when the last 2 steps have no kernels launched. 
+ def test_last_steps_no_kernel(self): + json_content = """ + [{ + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#1", "pid": 13721, "tid": "123", + "ts": 100, "dur": 200, + "args": {"Input Dims": [], "External id": 1} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 13721, "tid": "123", + "ts": 120, "dur": 10, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 2} + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#2", "pid": 13721, "tid": "123", + "ts": 300, "dur": 100, + "args": {"Input Dims": [], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#3", "pid": 13721, "tid": "123", + "ts": 400, "dur": 50, + "args": {"Input Dims": [], "External id": 4} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7", + "ts": 90, "dur": 20, + "args": {"correlation": 123, "external id": 0, "device": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemcpyAsync", "pid": 13721, "tid": "123", + "ts": 125, "dur": 5, + "args": {"correlation": 334, "external id": 2} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7", + "ts": 150, "dur": 180, + "args": {"correlation": 334, "external id": 2, "device": 0} + }] + """ + profile = parse_json_trace(json_content) + profile.process() + + # The last 2 steps without kernels are removed from overall view. + self.assertEqual(len(profile.steps_costs), 1) + step = profile.steps_costs[0] + self.assertEqual( + step.costs[ProfileRole.Total], (150 + 180) - (90 + 20)) + + def test_pure_cpu(self): + json_content = """ + [{ + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#1", "pid": 13721, "tid": "123", + "ts": 100, "dur": 200, + "args": {"Input Dims": [], "External id": 1} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 13721, "tid": "123", + "ts": 120, "dur": 10, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 2} + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#2", "pid": 13721, "tid": "123", + "ts": 300, "dur": 100, + "args": {"Input Dims": [], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 13721, "tid": "123", + "ts": 350, "dur": 40, + "args": {"Input Dims": [], "External id": 4} + }] + """ + profile = parse_json_trace(json_content) + profile.process() + + self.assertEqual(len(profile.steps_costs), 2) + step = profile.steps_costs[0] + self.assertEqual(step.costs[ProfileRole.Kernel], 0) + self.assertEqual(step.costs[ProfileRole.Memcpy], 0) + self.assertEqual(step.costs[ProfileRole.Memset], 0) + self.assertEqual(step.costs[ProfileRole.Runtime], 0) + self.assertEqual(step.costs[ProfileRole.DataLoader], 0) + self.assertEqual(step.costs[ProfileRole.CpuOp], 10) + self.assertEqual(step.costs[ProfileRole.Other], 200 - 10) + self.assertEqual(step.costs[ProfileRole.Total], 200) + step = profile.steps_costs[1] + self.assertEqual(step.costs[ProfileRole.Kernel], 0) + self.assertEqual(step.costs[ProfileRole.Memcpy], 0) + self.assertEqual(step.costs[ProfileRole.Memset], 0) + self.assertEqual(step.costs[ProfileRole.Runtime], 0) + self.assertEqual(step.costs[ProfileRole.DataLoader], 0) + self.assertEqual(step.costs[ProfileRole.CpuOp], 40) + self.assertEqual(step.costs[ProfileRole.Other], 100 - 40) + self.assertEqual(step.costs[ProfileRole.Total], 100) + + # Test GPU utilization, est. SM efficiency, and occupancy. 
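+    # A condensed sketch of the three metrics as the assertions below compute them:
+    #     gpu_utilization  = busy_time / wall_time                  # here (40 + 20) / 120
+    #     sm_efficiency(t) = min(sum of 'blocks per SM' of kernels running at t, 1.0),
+    #                        then averaged over the whole trace duration
+    #     avg_occupancy    = duration-weighted mean of each kernel's 'est. achieved occupancy %'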
+ def test_gpu_utilization(self): + json_content = """ + [{ + "ph": "X", "cat": "Operator", + "name": "aten::mat_mul", "pid": 13721, "tid": "456", + "ts": 100, "dur": 100, + "args": {"Input Dims": [], "External id": 2} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 13721, "tid": "456", + "ts": 120, "dur": 70, + "args": {"Input Dims": [], "External id": 4} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 1, "tid": "stream 7", + "ts": 130, "dur": 10, + "args": {"correlation": 334, "external id": 4, "device": 1, + "blocks per SM": 0.5, "est. achieved occupancy %": 0.6} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 120, "dur": 0, + "args": {"correlation": 334, "external id": 4} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void gemmSN_TN_kernel_64addr", "pid": 1, "tid": "stream 8", + "ts": 135, "dur": 15, + "args": {"correlation": 335, "external id": 2, "device": 1, + "blocks per SM": 0.6, "est. achieved occupancy %": 0.1} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void gemmSN_TN_kernel_64addr", "pid": 1, "tid": "stream 8", + "ts": 150, "dur": 0, + "args": {"correlation": 335, "external id": 2, "device": 1, + "blocks per SM": 0.3, "est. achieved occupancy %": 0.2} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 120, "dur": 0, + "args": {"correlation": 335, "external id": 2} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 1, "tid": "stream 7", + "ts": 145, "dur": 25, + "args": {"correlation": 336, "external id": 4, "device": 1, + "blocks per SM": 0.3, "est. achieved occupancy %": 1.0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 125, "dur": 3, + "args": {"correlation": 336, "external id": 4} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 1, "tid": "stream 7", + "ts": 200, "dur": 20, + "args": {"correlation": 337, "external id": 2, "device": 1, + "blocks per SM": 10.5, "est. 
achieved occupancy %": 0.3} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 195, "dur": 1, + "args": {"correlation": 337, "external id": 2} + }] + """ + profile = parse_json_trace(json_content) + profile.process() + + self.assertEqual(len(profile.gpu_metrics_parser.gpu_ids), 1) + self.assertAlmostEqual(profile.gpu_metrics_parser.gpu_utilization[1], (40 + 20) / 120) + self.assertAlmostEqual(profile.gpu_metrics_parser.avg_approximated_sm_efficiency_per_device[1], + (0.5 * (135 - 130) + + 1.0 * (140 - 135) + + 0.6 * (145 - 140) + + 0.9 * (150 - 145) + + 0.3 * (170 - 150) + + 1.0 * (220 - 200)) / (220 - 100)) + self.assertAlmostEqual(profile.gpu_metrics_parser.avg_occupancy_per_device[1], + (0.6 * 10 + 0.1 * 15 + 1.0 * 25 + 0.3 * 20) / (10 + 15 + 25 + 20)) + + gpu_util_expected = [(100, 0), (110, 0), (120, 0), (130, 1.0), (140, 1.0), (150, 1.0), (160, 1.0), + (170, 0), (180, 0), (190, 0), (200, 1.0), (210, 1.0), (220, 0)] + for gpu_id in profile.gpu_metrics_parser.gpu_ids: + buckets = profile.gpu_metrics_parser.gpu_util_buckets[gpu_id] + gpu_util_id = 0 + for b in buckets: + self.assertEqual(b[0], gpu_util_expected[gpu_util_id][0]) + self.assertAlmostEqual(b[1], gpu_util_expected[gpu_util_id][1]) + gpu_util_id += 1 + self.assertEqual(gpu_util_id, len(gpu_util_expected)) + + sm_efficiency_expected = [(130, 0.5), (135, 0), (135, 1.0), (140, 0), (140, 0.6), (145, 0), (145, 0.9), + (150, 0), (150, 0.3), (170, 0), (170, 0), (200, 0), (200, 1.0), (220, 0)] + for gpu_id in profile.gpu_metrics_parser.gpu_ids: + ranges = profile.gpu_metrics_parser.approximated_sm_efficiency_ranges[gpu_id] + sm_efficiency_id = 0 + for r in ranges: + self.assertEqual( + r[0], sm_efficiency_expected[sm_efficiency_id][0]) + self.assertAlmostEqual( + r[2], sm_efficiency_expected[sm_efficiency_id][1]) + sm_efficiency_id += 1 + self.assertEqual( + r[1], sm_efficiency_expected[sm_efficiency_id][0]) + self.assertAlmostEqual( + 0, sm_efficiency_expected[sm_efficiency_id][1]) + sm_efficiency_id += 1 + self.assertEqual(sm_efficiency_id, len(sm_efficiency_expected)) + + count = 0 + for agg_by_op in profile.kernel_list_groupby_name_op: + if agg_by_op.name == 'void gemmSN_TN_kernel_64addr' and agg_by_op.op_name == 'aten::mat_mul': + self.assertAlmostEqual(agg_by_op.avg_blocks_per_sm, 0.6) + self.assertAlmostEqual(agg_by_op.avg_occupancy, 0.1) + count += 1 + if agg_by_op.name == 'void cunn_ClassNLLCriterion_updateGradInput_kernel' and \ + agg_by_op.op_name == 'aten::mm': + self.assertAlmostEqual( + agg_by_op.avg_blocks_per_sm, (0.5 * 10 + 0.3 * 25) / (10 + 25)) + self.assertAlmostEqual( + agg_by_op.avg_occupancy, (0.6 * 10 + 1.0 * 25) / (10 + 25)) + count += 1 + if agg_by_op.name == 'void cunn_ClassNLLCriterion_updateGradInput_kernel' and \ + agg_by_op.op_name == 'aten::mat_mul': + self.assertAlmostEqual(agg_by_op.avg_blocks_per_sm, 10.5) + self.assertAlmostEqual(agg_by_op.avg_occupancy, 0.3) + count += 1 + self.assertEqual(count, 3) + + count = 0 + for _id, (name, row) in enumerate(profile.kernel_stat.iterrows()): + # The kernel with zero 'dur' should be ignored. 
+ if name == 'void gemmSN_TN_kernel_64addr': + self.assertAlmostEqual(row['blocks_per_sm'], 0.6) + self.assertAlmostEqual(row['occupancy'], 0.1) + count += 1 + if name == 'void cunn_ClassNLLCriterion_updateGradInput_kernel': + self.assertAlmostEqual( + row['blocks_per_sm'], (0.5 * 10 + 0.3 * 25 + 10.5 * 20) / (10 + 25 + 20)) + self.assertAlmostEqual( + row['occupancy'], (0.6 * 10 + 1.0 * 25 + 0.3 * 20) / (10 + 25 + 20)) + count += 1 + self.assertEqual(count, 2) + + # Test GPU utilization 3 metrics works fine if kernel out of ProfilerStep. + def test_gpu_utilization_kernel_out_of_step(self): + json_content = """ + [{ + "ph": "X", "cat": "Operator", + "name": "aten::mat_mul", "pid": 13721, "tid": "456", + "ts": 10, "dur": 10, + "args": {"Input Dims": [], "External id": 1} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 13721, "tid": "456", + "ts": 120, "dur": 70, + "args": {"Input Dims": [], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::mm", "pid": 13721, "tid": "456", + "ts": 220, "dur": 20, + "args": {"Input Dims": [], "External id": 4} + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#2", "pid": 13721, "tid": "456", + "ts": 100, "dur": 100, + "args": {"Input Dims": [], "External id": 2} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 1, "tid": "stream 7", + "ts": 60, "dur": 20, + "args": {"correlation": 334, "external id": 1, "device": 1, + "blocks per SM": 0.5, "est. achieved occupancy %": 0.6} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 15, "dur": 5, + "args": {"correlation": 334, "external id": 1} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 1, "tid": "stream 7", + "ts": 240, "dur": 25, + "args": {"correlation": 337, "external id": 4, "device": 1, + "blocks per SM": 10.5, "est. achieved occupancy %": 0.3} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 230, "dur": 10, + "args": {"correlation": 337, "external id": 4} + }] + """ + profile = parse_json_trace(json_content) + profile.process() + + self.assertEqual(len(profile.gpu_metrics_parser.gpu_ids), 1) + self.assertAlmostEqual(profile.gpu_metrics_parser.gpu_utilization[1], 0.0) + self.assertTrue(profile.gpu_metrics_parser.avg_approximated_sm_efficiency_per_device[1] is None) + self.assertTrue(profile.gpu_metrics_parser.avg_occupancy_per_device[1] is None) + self.assertTrue(profile.gpu_metrics_parser.blocks_per_sm_count[1] > 0) + self.assertTrue(profile.gpu_metrics_parser.occupancy_count[1] > 0) + + count = 0 + for agg_by_op in profile.kernel_list_groupby_name_op: + if agg_by_op.name == 'void cunn_ClassNLLCriterion_updateGradInput_kernel' \ + and agg_by_op.op_name == 'aten::mat_mul': + self.assertAlmostEqual(agg_by_op.avg_blocks_per_sm, 0.5) + self.assertAlmostEqual(agg_by_op.avg_occupancy, 0.6) + count += 1 + if agg_by_op.name == 'void cunn_ClassNLLCriterion_updateGradInput_kernel' and \ + agg_by_op.op_name == 'aten::mm': + self.assertAlmostEqual( + agg_by_op.avg_blocks_per_sm, 10.5) + self.assertAlmostEqual( + agg_by_op.avg_occupancy, 0.3) + count += 1 + self.assertEqual(count, 2) + + count = 0 + for _id, (name, row) in enumerate(profile.kernel_stat.iterrows()): + # The kernel with zero 'dur' should be ignored. 
+ if name == 'void cunn_ClassNLLCriterion_updateGradInput_kernel': + self.assertAlmostEqual(row['blocks_per_sm'], (20 * 0.5 + 25 * 10.5) / (20 + 25)) + self.assertAlmostEqual(row['occupancy'], (20 * 0.6 + 25 * 0.3) / (20 + 25)) + count += 1 + self.assertEqual(count, 1) + + def test_dump_gpu_metrics(self): + profile = RunProfile('test_dump_gpu_metrics', None) + # Faked data for easy to see in UI. Real data values are 1/100 of these. + gpu_util_buckets = [[(1621401187223005, 0.0), (1621401187224005, 0.0), + (1621401187225005, 0.6), (1621401187226005, 0.5), + (1621401187227005, 0.6), (1621401187228005, 0.2), + (1621401187229005, 0.6), (1621401187230005, 0.1), + (1621401187231005, 0.5), (1621401187232005, 0.2), + (1621401187233005, 0.3), (1621401187234005, 0.4), + (1621401187235005, 0.4219409282700422), + (1621401187236901, 0)]] + # Faked data for easy to see in UI. Real data values are 1/10 of these. + approximated_sm_efficiency_ranges = \ + [[(1621401187225275, 1621401187225278, 0.25), (1621401187225530, 1621401187225532, 0.125), + (1621401187225820, 1621401187225821, 0.125), (1621401187226325, 1621401187226327, 0.25), + (1621401187226575, 1621401187226577, 0.125), (1621401187226912, 1621401187226913, 0.125), + (1621401187227092, 1621401187227094, 0.125), (1621401187227619, 1621401187227620, 0.125), + (1621401187227745, 1621401187227746, 0.125), (1621401187227859, 1621401187227860, 0.125), + (1621401187227973, 1621401187227974, 0.125), (1621401187228279, 1621401187228280, 0.125), + (1621401187228962, 1621401187228963, 0.125), (1621401187229153, 1621401187229155, 0.125), + (1621401187229711, 1621401187229715, 0.125), (1621401187230162, 1621401187230163, 0.125), + (1621401187231100, 1621401187231103, 0.125), (1621401187231692, 1621401187231694, 0.5), + (1621401187232603, 1621401187232604, 0.125), (1621401187232921, 1621401187232922, 0.125), + (1621401187233342, 1621401187233343, 0.125), (1621401187233770, 1621401187233772, 0.125), + (1621401187234156, 1621401187234159, 0.125), (1621401187234445, 1621401187234446, 0.125), + (1621401187235025, 1621401187235028, 0.125), (1621401187235555, 1621401187235556, 0.125), + (1621401187236158, 1621401187236159, 0.125), (1621401187236278, 1621401187236279, 0.125), + (1621401187236390, 1621401187236391, 0.125), (1621401187236501, 1621401187236502, 0.125)]] + + basedir = os.path.dirname(os.path.realpath(__file__)) + trace_json_flat_path = os.path.join(basedir, 'gpu_metrics_input.json') + gpu_metrics_parser = GPUMetricsParser() + gpu_metrics_parser.gpu_util_buckets = gpu_util_buckets + gpu_metrics_parser.approximated_sm_efficiency_ranges = approximated_sm_efficiency_ranges + profile.gpu_metrics = gpu_metrics_parser.get_gpu_metrics() + with open(trace_json_flat_path, 'rb') as file: + raw_data = file.read() + data_with_gpu_metrics_compressed = profile.append_gpu_metrics(raw_data) + data_with_gpu_metrics_flat = gzip.decompress( + data_with_gpu_metrics_compressed) + + trace_json_expected_path = os.path.join(basedir, 'gpu_metrics_expected.json') + with open(trace_json_expected_path, 'rb') as file: + data_expected = file.read() + + # Parse to json in order to ignore text format difference. 
+ data_with_gpu_metrics_json = json.loads( + data_with_gpu_metrics_flat.decode('utf8')) + data_expected_json = json.loads(data_expected.decode('utf8')) + data_with_gpu_metrics_str = json.dumps( + data_with_gpu_metrics_json, sort_keys=True) + data_expected_str = json.dumps(data_expected_json, sort_keys=True) + + self.assertEqual(data_with_gpu_metrics_str, data_expected_str) + + try: + _ = json.loads(data_with_gpu_metrics_flat.decode('utf8')) + except Exception: + self.assertTrue( + False, 'The string fails to be parsed by json after appending gpu metrics.') + + def test_memory_view(self): + json_content = """[ + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 13721, "tid": "123", + "ts": 10, "dur": 10, + "args": {"Input Dims": [], "External id": 2} + }, + { + "ph": "X", "cat": "Operator", + "name": "enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__", "pid": 13721, "tid": "123", + "ts": 100, "dur": 180, + "args": {"Input Dims": [], "External id": 2} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 13721, "tid": "123", + "ts": 200, "dur": 60, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::nll_loss_backward", "pid": 13721, "tid": "123", + "ts": 340, "dur": 70, + "args": {"Input Dims": [[], [32, 1000], [32], [], [], [], []], "External id": 4} + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#1", "pid": 13721, "tid": "123", + "ts": 50, "dur": 400, + "args": {"Input Dims": [], "External id": 1} + }, + { + "ph": "X", "cat": "Operator", + "name": "ProfilerStep#2", "pid": 13721, "tid": "123", + "ts": 500, "dur": 500, + "args": {"Input Dims": [], "External id": 1} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 13721, "tid": "123", + "ts": 510, "dur": 150, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::copy_", "pid": 13721, "tid": "123", + "ts": 520, "dur": 100, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + + { + "ph": "X", "cat": "Operator", + "name": "aten::liner", "pid": 13721, "tid": "123", + "ts": 700, "dur": 100, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::t", "pid": 13721, "tid": "123", + "ts": 705, "dur": 40, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::transpose", "pid": 13721, "tid": "123", + "ts": 710, "dur": 30, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::tranas_stride", "pid": 13721, "tid": "123", + "ts": 720, "dur": 10, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::addmm", "pid": 13721, "tid": "123", + "ts": 750, "dur": 40, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::to", "pid": 13721, "tid": "123", + "ts": 900, "dur": 100, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Memcpy", + "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": "stream 7", + "ts": 405, "dur": 10, + "args": {"stream": 7, "correlation": 334, "external 
id": 4} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemcpyAsync", "pid": 13721, "tid": "456", + "ts": 360, "dur": 20, + "args": {"correlation": 334, "external id": 4} + }, + { + "ph": "X", "cat": "Memset", + "name": "Memset (Device)", "pid": 0, "tid": "stream 7", + "ts": 420, "dur": 5, + "args": {"stream": 7, "correlation": 40344, "external id": 4} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaMemsetAsync", "pid": 13721, "tid": "456", + "ts": 390, "dur": 10, + "args": {"correlation": 40344, "external id": 4} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void cunn_ClassNLLCriterion_updateGradInput_kernel", "pid": 0, "tid": "stream 7", + "ts": 430, "dur": 15, + "args": {"correlation": 40348, "external id": 4, "device": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 405, "dur": 5, + "args": {"correlation": 40348, "external id": 4} + }, + + + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 90, + "args": { + "Device Type": 0, "Device Id": -1, "Addr": 90, "Bytes": 4 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 150, + "args": { + "Device Type": 0, "Device Id": -1, "Addr": 150, "Bytes": 4 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 200, + "args": { + "Device Type": 0, "Device Id": -1, "Addr": 200, "Bytes": 4 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 210, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 210, "Bytes": 4 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 265, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 265, "Bytes": 4 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 300, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 300, "Bytes": 4 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 350, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 350, "Bytes": 10 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 360, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 350, "Bytes": -10 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 450, + "args": { + "Device Type": 0, "Device Id": -1, "Addr": 450, "Bytes": 1000000 + } + }, + + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 515, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 515, "Bytes": 100 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 520, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 520, "Bytes": 100 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 600, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 520, "Bytes": -100 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 690, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 690, "Bytes": 100 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 701, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 701, "Bytes": 100 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 796, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 515, "Bytes": -100 + } + }, + + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 
13721, "tid": 123, + "ts": 708, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 708, "Bytes": 100 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 742, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 708, "Bytes": -100 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 715, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 715, "Bytes": 50 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 735, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 715, "Bytes": -50 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 725, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 725, "Bytes": 50 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 728, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 725, "Bytes": -50 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 729, + "args": { + "Device Type": 0, "Device Id": -1, "Addr": 729, "Bytes": 50 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 746, + "args": { + "Device Type": 0, "Device Id": -1, "Addr": 746, "Bytes": 100 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 747, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 747, "Bytes": 20 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 749, + "args": { + "Device Type": 0, "Device Id": -1, "Addr": 690, "Bytes": -100 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 760, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 760, "Bytes": 30 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 780, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 760, "Bytes": -30 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 795, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 795, "Bytes": 10 + } + }, + { + "ph": "i", "s": "t", "name": "[memory]", + "pid": 13721, "tid": 123, + "ts": 799, + "args": { + "Device Type": 1, "Device Id": 0, "Addr": 795, "Bytes": -10 + } + } + ] + """ + import logging + + from torch_tb_profiler.utils import get_logger + logger = get_logger() + logger.addHandler(logging.StreamHandler()) + + profile = parse_json_trace(json_content) + profile.process() + memory_stats = profile.memory_snapshot.get_memory_statistics(profile.tid2tree) + + self.assertEqual(len(memory_stats), 2) + self.assertIn('GPU0', memory_stats) + + # validation + gpu_expected_data = { + # self increase size, self allocation size, self allocation count, increase size, allocation size, allocation count, call # noqa: E501 + 'aten::to': [104, 104, 2, 104, 204, 3, 4], + 'aten::nll_loss_backward': [0, 10, 1, 0, 10, 1, 1], + 'aten::copy_': [0, 100, 1, 0, 100, 1, 1], + 'aten::addmm': [0, 30, 1, 0, 30, 1, 1], + 'aten::tranas_stride': [0, 50, 1, 0, 50, 1, 1], + 'aten::transpose': [0, 50, 1, 0, 100, 2, 1], + 'aten::t': [0, 100, 1, 0, 200, 3, 1], + 'aten::liner': [20, 130, 3, 20, 360, 7, 1] + } + + cpu_expected_data = { + 'aten::to': [4, 4, 1, 4, 4, 1, 4], + 'aten::liner': [0, 100, 1, 50, 150, 2, 1], + 'aten::tranas_stride': [50, 50, 1, 50, 50, 1, 1], + 'aten::transpose': [0, 0, 0, 50, 50, 1, 1], + 'aten::t': [0, 0, 0, 50, 50, 1, 1] + } + + validate_data = [ + (memory_stats['CPU'], 
cpu_expected_data), + (memory_stats['GPU0'], gpu_expected_data) + ] + for (mem_stat, expected_data) in validate_data: + for name, values in expected_data.items(): + self.assertEqual(mem_stat[name], values) + + # Test group by 'kernel detail + op name'. + def test_group_by_kernel_columns(self): + json_content = """[ + { + "ph": "X", "cat": "Operator", + "name": "op1", "pid": 13721, "tid": "123", + "ts": 200, "dur": 60, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "op2", "pid": 13721, "tid": "456", + "ts": 340, "dur": 70, + "args": {"Input Dims": [[], [32, 1000], [32], [], [], [], []], "External id": 4} + }, + { + "ph": "X", "cat": "Kernel", + "name": "kernel1", "pid": 0, "tid": "stream 7", + "ts": 230, "dur": 15, + "args": {"correlation": 1000, "external id": 3, "device": 0, + "grid": [16, 1, 1], "block": [16, 16, 16], "registers per thread": 18, "shared memory": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 210, "dur": 5, + "args": {"correlation": 1000, "external id": 3} + }, + { + "ph": "X", "cat": "Kernel", + "name": "kernel1", "pid": 0, "tid": "stream 7", + "ts": 250, "dur": 10, + "args": {"correlation": 1001, "external id": 3, "device": 0, + "grid": [16, 1, 1], "block": [16, 16, 16], "registers per thread": 18, "shared memory": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 215, "dur": 5, + "args": {"correlation": 1001, "external id": 3} + }, + { + "ph": "X", "cat": "Kernel", + "name": "kernel1", "pid": 0, "tid": "stream 7", + "ts": 250, "dur": 13, + "args": {"correlation": 1002, "external id": 3, "device": 0, + "grid": [16, 1, 1], "block": [16, 16, 64], "registers per thread": 18, "shared memory": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 220, "dur": 5, + "args": {"correlation": 1002, "external id": 3} + }, + { + "ph": "X", "cat": "Kernel", + "name": "kernel1", "pid": 0, "tid": "stream 7", + "ts": 250, "dur": 17, + "args": {"correlation": 1003, "external id": 4, "device": 0, + "grid": [16, 1, 1], "block": [16, 16, 64], "registers per thread": 18, "shared memory": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 350, "dur": 5, + "args": {"correlation": 1003, "external id": 4} + } + ] + """ + profile = parse_json_trace(json_content) + profile.process() + expected_agg_kernels = [ + { + 'name': 'kernel1', + 'op_name': 'op1', + 'grid': '[16, 1, 1]', + 'block': '[16, 16, 16]', + 'registers per thread': 18, + 'shared memory': 0, + 'calls': 2, + 'total_duration': 15 + 10, + 'avg_duration': (15 + 10) / 2, + 'min_duration': min(15, 10), + 'max_duration': max(15, 10) + }, + { + 'name': 'kernel1', + 'op_name': 'op1', + 'grid': '[16, 1, 1]', + 'block': '[16, 16, 64]', # Only changed this. + 'registers per thread': 18, + 'shared memory': 0, + 'calls': 1, + 'total_duration': 13, + 'avg_duration': 13, + 'min_duration': 13, + 'max_duration': 13 + }, + { + 'name': 'kernel1', + 'op_name': 'op2', # Only changed this. 
+ 'grid': '[16, 1, 1]', + 'block': '[16, 16, 64]', + 'registers per thread': 18, + 'shared memory': 0, + 'calls': 1, + 'total_duration': 17, + 'avg_duration': 17, + 'min_duration': 17, + 'max_duration': 17 + } + ] + index = 0 + self.assertEqual(len(profile.kernel_list_groupby_name_op), len(expected_agg_kernels)) + for agg_kernel in profile.kernel_list_groupby_name_op: + expected_agg_kernel = expected_agg_kernels[index] + self.assertEqual(agg_kernel.name, expected_agg_kernel['name']) + self.assertEqual(agg_kernel.op_name, expected_agg_kernel['op_name']) + self.assertEqual(str(agg_kernel.grid), expected_agg_kernel['grid']) + self.assertEqual(str(agg_kernel.block), expected_agg_kernel['block']) + self.assertEqual(agg_kernel.regs_per_thread, expected_agg_kernel['registers per thread']) + self.assertEqual(agg_kernel.shared_memory, expected_agg_kernel['shared memory']) + self.assertEqual(agg_kernel.calls, expected_agg_kernel['calls']) + self.assertEqual(agg_kernel.total_duration, expected_agg_kernel['total_duration']) + self.assertAlmostEqual(agg_kernel.avg_duration, expected_agg_kernel['avg_duration']) + self.assertEqual(agg_kernel.min_duration, expected_agg_kernel['min_duration']) + self.assertEqual(agg_kernel.max_duration, expected_agg_kernel['max_duration']) + index += 1 + + # Test group by 'kernel detail + op name' with invalid input lack of some kernel field + def test_group_by_kernel_columns_invalid_input(self): + json_content = """[ + { + "ph": "X", "cat": "Operator", + "name": "op1", "pid": 13721, "tid": "123", + "ts": 200, "dur": 60, + "args": {"Input Dims": [[2, 8, 5], [], [], [], [], [], [], []], "External id": 3} + }, + { + "ph": "X", "cat": "Kernel", + "name": "kernel1", "pid": 0, "tid": "stream 7", + "ts": 220, "dur": 1, + "args": {"correlation": 1000, "external id": 3, "device": 0, + "block": [16, 16, 16], "registers per thread": 18, "shared memory": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 210, "dur": 5, + "args": {"correlation": 1000, "external id": 3} + }, + { + "ph": "X", "cat": "Kernel", + "name": "kernel1", "pid": 0, "tid": "stream 7", + "ts": 230, "dur": 2, + "args": {"correlation": 1001, "external id": 3, "device": 0, + "grid": [16, 1, 1], "registers per thread": 18, "shared memory": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 220, "dur": 5, + "args": {"correlation": 1001, "external id": 3} + }, + { + "ph": "X", "cat": "Kernel", + "name": "kernel1", "pid": 0, "tid": "stream 7", + "ts": 240, "dur": 3, + "args": {"correlation": 1002, "external id": 3, "device": 0, + "grid": [16, 1, 1], "block": [16, 16, 16], "shared memory": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 230, "dur": 5, + "args": {"correlation": 1002, "external id": 3} + }, + { + "ph": "X", "cat": "Kernel", + "name": "kernel1", "pid": 0, "tid": "stream 7", + "ts": 250, "dur": 4, + "args": {"correlation": 1003, "external id": 3, "device": 0, + "grid": [16, 1, 1], "block": [16, 16, 16], "registers per thread": 18} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 240, "dur": 5, + "args": {"correlation": 1003, "external id": 3} + }, + { + "ph": "X", "cat": "Kernel", + "name": "kernel1", "pid": 0, "tid": "stream 7", + "ts": 260, "dur": 5, + "args": {"correlation": 1004, "external id": 3, "device": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": 
"cudaLaunchKernel", "pid": 13721, "tid": "456", + "ts": 250, "dur": 5, + "args": {"correlation": 1004, "external id": 3} + } + ] + """ + profile = parse_json_trace(json_content) + profile.process() + expected_agg_kernels = [ + { + 'name': 'kernel1', + 'op_name': 'op1', + 'grid': None, + 'block': [16, 16, 16], + 'registers per thread': 18, + 'shared memory': 0, + 'calls': 1, + 'total_duration': 1, + 'avg_duration': 1, + 'min_duration': 1, + 'max_duration': 1 + }, + { + 'name': 'kernel1', + 'op_name': 'op1', + 'grid': [16, 1, 1], + 'block': None, + 'registers per thread': 18, + 'shared memory': 0, + 'calls': 1, + 'total_duration': 2, + 'avg_duration': 2, + 'min_duration': 2, + 'max_duration': 2 + }, + { + 'name': 'kernel1', + 'op_name': 'op1', + 'grid': [16, 1, 1], + 'block': [16, 16, 16], + 'registers per thread': None, + 'shared memory': 0, + 'calls': 1, + 'total_duration': 3, + 'avg_duration': 3, + 'min_duration': 3, + 'max_duration': 3 + }, + { + 'name': 'kernel1', + 'op_name': 'op1', + 'grid': [16, 1, 1], + 'block': [16, 16, 16], + 'registers per thread': 18, + 'shared memory': None, + 'calls': 1, + 'total_duration': 4, + 'avg_duration': 4, + 'min_duration': 4, + 'max_duration': 4 + }, + { + 'name': 'kernel1', + 'op_name': 'op1', + 'grid': None, + 'block': None, + 'registers per thread': None, + 'shared memory': None, + 'calls': 1, + 'total_duration': 5, + 'avg_duration': 5, + 'min_duration': 5, + 'max_duration': 5 + } + ] + index = 0 + self.assertEqual(len(profile.kernel_list_groupby_name_op), len(expected_agg_kernels)) + for agg_kernel in profile.kernel_list_groupby_name_op: + expected_agg_kernel = expected_agg_kernels[index] + self.assertEqual(agg_kernel.name, expected_agg_kernel['name']) + self.assertEqual(agg_kernel.op_name, expected_agg_kernel['op_name']) + self.assertEqual(agg_kernel.grid, expected_agg_kernel['grid']) + self.assertEqual(agg_kernel.block, expected_agg_kernel['block']) + self.assertEqual(agg_kernel.regs_per_thread, expected_agg_kernel['registers per thread']) + print(agg_kernel.name, agg_kernel.grid, agg_kernel.block, agg_kernel.shared_memory) + self.assertEqual(agg_kernel.shared_memory, expected_agg_kernel['shared memory']) + self.assertEqual(agg_kernel.calls, expected_agg_kernel['calls']) + self.assertEqual(agg_kernel.total_duration, expected_agg_kernel['total_duration']) + self.assertAlmostEqual(agg_kernel.avg_duration, expected_agg_kernel['avg_duration']) + self.assertEqual(agg_kernel.min_duration, expected_agg_kernel['min_duration']) + self.assertEqual(agg_kernel.max_duration, expected_agg_kernel['max_duration']) + index += 1 + + # Test tensor core related feature. 
+ def test_tensor_core(self): + json_content = """[ + { + "ph": "X", "cat": "Operator", + "name": "aten::conv2d", "pid": 13721, "tid": "123", + "ts": 200, "dur": 100, + "args": {"Input Dims": [[]], "External id": 3} + }, + { + "ph": "X", "cat": "Operator", + "name": "op_no_tc", "pid": 13721, "tid": "123", + "ts": 205, "dur": 10, + "args": {"Input Dims": [[]], "External id": 4} + }, + { + "ph": "X", "cat": "Operator", + "name": "aten::cudnn_convolution", "pid": 13721, "tid": "123", + "ts": 215, "dur": 10, + "args": {"Input Dims": [[]], "External id": 5} + }, + { + "ph": "X", "cat": "Kernel", + "name": "kernel_no_tc", "pid": 0, "tid": "stream 7", + "ts": 210, "dur": 10, + "args": {"correlation": 1000, "external id": 4, "device": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "123", + "ts": 205, "dur": 5, + "args": {"correlation": 1000, "external id": 4} + }, + { + "ph": "X", "cat": "Kernel", + "name": "volta_fp16_s884cudnn_fp16_128x128_ldg8_splitK_relu_f2f_exp_small_nhwc_tn_v1", + "pid": 0, "tid": "stream 7", + "ts": 220, "dur": 15, + "args": {"correlation": 1001, "external id": 5, "device": 0} + }, + { + "ph": "X", "cat": "Runtime", + "name": "cudaLaunchKernel", "pid": 13721, "tid": "123", + "ts": 215, "dur": 5, + "args": {"correlation": 1001, "external id": 5} + } + ] + """ + profile = parse_json_trace(json_content) + profile.process() + + expected_agg_ops = { + 'aten::conv2d': { + 'tc_eligible': True, + 'tc_self_ratio': 0, + 'tc_total_ratio': 15 / (15 + 10) + }, + 'op_no_tc': { + 'tc_eligible': False, + 'tc_self_ratio': 0, + 'tc_total_ratio': 0 + }, + 'aten::cudnn_convolution': { + 'tc_eligible': True, + 'tc_self_ratio': 1.0, + 'tc_total_ratio': 1.0 + } + } + self.assertEqual(len(profile.op_list_groupby_name), len(expected_agg_ops)) + for agg_op in profile.op_list_groupby_name: + expected_agg_op = expected_agg_ops[agg_op.name] + self.assertEqual(agg_op.tc_eligible, expected_agg_op['tc_eligible']) + self.assertAlmostEqual(agg_op.tc_self_ratio, expected_agg_op['tc_self_ratio']) + self.assertAlmostEqual(agg_op.tc_total_ratio, expected_agg_op['tc_total_ratio']) + + expected_kernels_groupby_op = { + 'kernel_no_tc': { + 'op_name': 'op_no_tc', + 'tc_used': False, + 'op_tc_eligible': False + }, + 'volta_fp16_s884cudnn_fp16_128x128_ldg8_splitK_relu_f2f_exp_small_nhwc_tn_v1': { + 'op_name': 'aten::cudnn_convolution', + 'tc_used': True, + 'op_tc_eligible': True + } + } + self.assertEqual(len(profile.kernel_list_groupby_name_op), len(expected_kernels_groupby_op)) + for agg_kernel in profile.kernel_list_groupby_name_op: + expected_agg_kernel = expected_kernels_groupby_op[agg_kernel.name] + self.assertEqual(agg_kernel.op_name, expected_agg_kernel['op_name']) + self.assertEqual(agg_kernel.tc_used, expected_agg_kernel['tc_used']) + self.assertEqual(agg_kernel.op_tc_eligible, expected_agg_kernel['op_tc_eligible']) + + self.assertAlmostEqual(profile.tc_ratio[0], 15 / (15 + 10)) + self.assertAlmostEqual(profile.tc_eligible_ops_kernel_ratio, 15 / (15 + 10)) + + +class TestDistributed(unittest.TestCase): + + def test_distributed_nccl(self): + json_content0 = """[ + { + "ph": "X", "cat": "cpu_op", + "name": "nccl:broadcast", "pid": 23803, "tid": "23803", + "ts": 0, "dur": 75, + "args": {"External id": 146, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "Kernel", + "name": "ncclKernel_Broadcast_RING_LL_Sum_int8_t(ncclWorkElem)", "pid": 0, "tid": "stream 16", + "ts": 16, "dur": 16, + "args": {"device": 0, "correlation": 28506, 
"external id": 146} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "aten::add_", "pid": 23803, "tid": "23803", + "ts": 100, "dur": 20, + "args": {"External id": 24504, "Input Dims": [[1000], [1000], []], "Input type": ["float", "float", "Int"]} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel", "pid": 0, "tid": "stream 7", + "ts": 130, "dur": 161, + "args": {"device": 0, "correlation": 99765, "external id": 24504} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "nccl:all_reduce", "pid": 23803, "tid": "25166", + "ts": 160, "dur": 75, + "args": {"External id": 2513, "Input Dims": [[2049000]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "Kernel", + "name": "ncclKernel_AllReduce_RING_LL_Sum_float(ncclWorkElem)", "pid": 0, "tid": "stream 16", + "ts": 162, "dur": 1556, + "args": {"device": 0, "correlation": 33218, "external id": 2513} + } + ] + """ + json_content1 = """[ + { + "ph": "X", "cat": "cpu_op", + "name": "nccl:broadcast", "pid": 23803, "tid": "23803", + "ts": 0, "dur": 20, + "args": {"External id": 146, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "Kernel", + "name": "ncclKernel_Broadcast_RING_LL_Sum_int8_t(ncclWorkElem)", "pid": 0, "tid": "stream 16", + "ts": 8, "dur": 31, + "args": {"device": 0, "correlation": 28506, "external id": 146} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "aten::add_", "pid": 23803, "tid": "23803", + "ts": 25, "dur": 20, + "args": {"External id": 24504, "Input Dims": [[1000], [1000], []], "Input type": ["float", "float", "Int"]} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel", "pid": 0, "tid": "stream 7", + "ts": 30, "dur": 161, + "args": {"device": 0, "correlation": 99765, "external id": 24504} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "nccl:all_reduce", "pid": 23803, "tid": "25166", + "ts": 160, "dur": 75, + "args": {"External id": 2513, "Input Dims": [[2049000]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "Kernel", + "name": "ncclKernel_AllReduce_RING_LL_Sum_float(ncclWorkElem)", "pid": 0, "tid": "stream 16", + "ts": 562, "dur": 1058, + "args": {"device": 0, "correlation": 33218, "external id": 2513} + } + ] + """ + + profile0 = parse_json_trace(json_content0, 'worker0') + dist_data0 = DistributedRunProfileData(profile0) + self.assertTrue(profile0.has_communication) + self.assertEqual(len(profile0.comm_node_list), 2) + self.assertEqual(profile0.steps_costs[0].costs, [105, 0, 0, 16, 0, 0, 79, 35, 235]) + + profile1 = parse_json_trace(json_content1, 'worker1') + dist_data1 = DistributedRunProfileData(profile1) + self.assertTrue(profile1.has_communication) + self.assertEqual(len(profile1.comm_node_list), 2) + self.assertEqual(profile1.steps_costs[0].costs[3], 22) + + loader = RunLoader('test_nccl', '', None) + dist_profile = loader._process_distributed_profiles([dist_data0, dist_data1], 0) + self.assertEqual(dist_profile.steps_to_overlap['data']['0']['worker0'], [32, 73, 16, 114]) + self.assertEqual(dist_profile.steps_to_overlap['data']['0']['worker1'], [152, 9, 22, 52]) + self.assertEqual(dist_profile.steps_to_wait['data']['0']['worker0'], [1074, 498]) + self.assertEqual(dist_profile.steps_to_wait['data']['0']['worker1'], [1074, 15]) + self.assertEqual(dist_profile.comm_ops['data']['worker0']['rows'], + [['nccl:broadcast', 1, 212480, 212480, 16, 16, 16, 16], + ['nccl:all_reduce', 1, 8196000, 8196000, 1556, 1556, 1058, 1058]]) + self.assertEqual(dist_profile.comm_ops['data']['worker1']['rows'], 
+ [['nccl:broadcast', 1, 212480, 212480, 31, 31, 16, 16], + ['nccl:all_reduce', 1, 8196000, 8196000, 1058, 1058, 1058, 1058]]) + + def test_distributed_gloo_gpu(self): + json_content0 = """[ + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23803", + "ts": 16, "dur": 38, + "args": {"External id": 165, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23805", + "ts": 25, "dur": 36, + "args": {"External id": 166, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23803", + "ts": 66, "dur": 18, + "args": {"External id": 167, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "aten::add_", "pid": 23803, "tid": "23800", + "ts": 0, "dur": 20, + "args": {"External id": 24504, "Input Dims": [[1000], [1000], []], "Input type": ["float", "float", "Int"]} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel", "pid": 0, "tid": "stream 7", + "ts": 30, "dur": 101, + "args": {"device": 0, "correlation": 99765, "external id": 24504} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:all_reduce", "pid": 23803, "tid": "23805", + "ts": 110, "dur": 18, + "args": {"External id": 2513, "Input Dims": [[2049000]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:all_reduce", "pid": 23803, "tid": "23803", + "ts": 120, "dur": 36, + "args": {"External id": 2516, "Input Dims": [[2049000]], "Input type": ["float"]} + } + ] + """ + json_content1 = """[ + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23803", + "ts": 20, "dur": 28, + "args": {"External id": 256, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23805", + "ts": 28, "dur": 30, + "args": {"External id": 257, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23803", + "ts": 77, "dur": 6, + "args": {"External id": 258, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "aten::add_", "pid": 23803, "tid": "23800", + "ts": 0, "dur": 30, + "args": {"External id": 24504, "Input Dims": [[1000], [1000], []], "Input type": ["float", "float", "Int"]} + }, + { + "ph": "X", "cat": "Kernel", + "name": "void at::native::vectorized_elementwise_kernel", "pid": 0, "tid": "stream 7", + "ts": 70, "dur": 70, + "args": {"device": 0, "correlation": 99765, "external id": 24504} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:all_reduce", "pid": 23803, "tid": "23805", + "ts": 88, "dur": 38, + "args": {"External id": 2513, "Input Dims": [[2049000]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:all_reduce", "pid": 23803, "tid": "23803", + "ts": 130, "dur": 16, + "args": {"External id": 2516, "Input Dims": [[2049000]], "Input type": ["float"]} + } + ] + """ + + profile0 = parse_json_trace(json_content0, 'worker0') + dist_data0 = DistributedRunProfileData(profile0) + self.assertTrue(profile0.has_communication) + self.assertEqual(len(profile0.comm_node_list), 5) + self.assertEqual(profile0.steps_costs[0].costs, [101, 0, 0, 39, 0, 0, 16, 0, 156]) + + profile1 = parse_json_trace(json_content1, 'worker1') + dist_data1 = DistributedRunProfileData(profile1) + 
self.assertTrue(profile1.has_communication) + self.assertEqual(len(profile1.comm_node_list), 5) + self.assertEqual(profile1.steps_costs[0].costs, [70, 0, 0, 44, 0, 0, 20, 12, 146]) + + loader = RunLoader('test_gloo_gpu', '', None) + dist_profile = loader._process_distributed_profiles([dist_data0, dist_data1], 0) + self.assertEqual(dist_profile.steps_to_overlap['data']['0']['worker0'], [31, 70, 39, 16]) + self.assertEqual(dist_profile.steps_to_overlap['data']['0']['worker1'], [16, 54, 44, 32]) + self.assertEqual(dist_profile.steps_to_wait['data']['0']['worker0'], [75, 34]) + self.assertEqual(dist_profile.steps_to_wait['data']['0']['worker1'], [78, 20]) + self.assertEqual(dist_profile.comm_ops['data']['worker0']['rows'], + [['gloo:broadcast', 3, 637440, 212480, 63, 21, 41, 14], + ['gloo:all_reduce', 2, 16392000, 8196000, 46, 23, 34, 17]]) + self.assertEqual(dist_profile.comm_ops['data']['worker1']['rows'], + [['gloo:broadcast', 3, 637440, 212480, 44, 15, 44, 15], + ['gloo:all_reduce', 2, 16392000, 8196000, 54, 27, 34, 17]]) + + def test_distributed_gloo_cpu(self): + json_content0 = """[ + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23803", + "ts": 16, "dur": 38, + "args": {"External id": 165, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23805", + "ts": 25, "dur": 36, + "args": {"External id": 166, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23803", + "ts": 66, "dur": 18, + "args": {"External id": 167, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "aten::add_", "pid": 23803, "tid": "23800", + "ts": 0, "dur": 20, + "args": {"External id": 24504, "Input Dims": [[1000], [1000], []], "Input type": ["float", "float", "Int"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "aten::mul", "pid": 23803, "tid": "23800", + "ts": 30, "dur": 101, + "args": {"External id": 24505} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:all_reduce", "pid": 23803, "tid": "23805", + "ts": 110, "dur": 18, + "args": {"External id": 2513, "Input Dims": [[2049000]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:all_reduce", "pid": 23803, "tid": "23803", + "ts": 120, "dur": 36, + "args": {"External id": 2516, "Input Dims": [[2049000]], "Input type": ["float"]} + } + ] + """ + json_content1 = """[ + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23803", + "ts": 20, "dur": 28, + "args": {"External id": 256, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23805", + "ts": 28, "dur": 30, + "args": {"External id": 257, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:broadcast", "pid": 23803, "tid": "23803", + "ts": 77, "dur": 6, + "args": {"External id": 258, "Input Dims": [[53120]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "aten::add_", "pid": 23803, "tid": "23800", + "ts": 0, "dur": 30, + "args": {"External id": 24504, "Input Dims": [[1000], [1000], []], "Input type": ["float", "float", "Int"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "aten::mul", "pid": 23803, "tid": "23800", + "ts": 70, "dur": 70, + "args": {"External id": 24505} + }, + { + "ph": "X", "cat": "cpu_op", + "name": 
"gloo:all_reduce", "pid": 23803, "tid": "23805", + "ts": 88, "dur": 38, + "args": {"External id": 2513, "Input Dims": [[2049000]], "Input type": ["float"]} + }, + { + "ph": "X", "cat": "cpu_op", + "name": "gloo:all_reduce", "pid": 23803, "tid": "23803", + "ts": 130, "dur": 16, + "args": {"External id": 2516, "Input Dims": [[2049000]], "Input type": ["float"]} + } + ] + """ + + profile0 = parse_json_trace(json_content0, 'worker0') + dist_data0 = DistributedRunProfileData(profile0) + self.assertTrue(profile0.has_communication) + self.assertEqual(len(profile0.comm_node_list), 5) + self.assertEqual(profile0.steps_costs[0].costs, [0, 0, 0, 109, 0, 0, 47, 0, 156]) + + profile1 = parse_json_trace(json_content1, 'worker1') + dist_data1 = DistributedRunProfileData(profile1) + self.assertTrue(profile1.has_communication) + self.assertEqual(len(profile1.comm_node_list), 5) + self.assertEqual(profile1.steps_costs[0].costs, [0, 0, 0, 98, 0, 0, 36, 12, 146]) + + loader = RunLoader('test_gloo_cpu', '', None) + dist_profile = loader._process_distributed_profiles([dist_data0, dist_data1], 0) + self.assertEqual(dist_profile.steps_to_overlap['data']['0']['worker0'], [47, 74, 35, 0]) + self.assertEqual(dist_profile.steps_to_overlap['data']['0']['worker1'], [36, 64, 34, 12]) + self.assertEqual(dist_profile.steps_to_wait['data']['0']['worker0'], [75, 34]) + self.assertEqual(dist_profile.steps_to_wait['data']['0']['worker1'], [78, 20]) + self.assertEqual(dist_profile.comm_ops['data']['worker0']['rows'], + [['gloo:broadcast', 3, 637440, 212480, 63, 21, 41, 14], + ['gloo:all_reduce', 2, 16392000, 8196000, 46, 23, 34, 17]]) + self.assertEqual(dist_profile.comm_ops['data']['worker1']['rows'], + [['gloo:broadcast', 3, 637440, 212480, 44, 15, 44, 15], + ['gloo:all_reduce', 2, 16392000, 8196000, 54, 27, 34, 17]]) + + +class TestMemoryCurve(unittest.TestCase): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.event_data_cpu = [ + [1, 0, 0, 1, 4, 4, 0], # alloc 1 + [20, 0, 0, 1, -4, 0, 0], # free 1 + [100, 0, 0, 2, 8000, 8000, 0], # alloc 2 + [200, 0, 0, 2, -8000, 0, 0], # free 2 + [300, 0, 0, 3, 4, 4, 0], # alloc 3 + [400, 0, 0, 4, 16, 20, 0], # alloc 4 + [500, 0, 0, 5, 4000, 4020, 0], # alloc 5 + [600, 0, 0, 4, -16, 4004, 0], # free 4 + [700, 0, 0, 7, 80, 4084, 0], # alloc 7 + [800, 0, 0, 3, -4, 4080, 0], # free 3 + [900, 0, 0, 7, -80, 4000, 0], # free 7 + [905, 0, 0, 4, -4000, 0, 0], # free 5 + ] + + self.event_data_gpu = [ + [2, 1, 0, 11, 400, 400, 512], # alloc 11 + [22, 1, 0, 11, -400, 0, 512], # free 11 + [105, 1, 0, 12, 5000, 5000, 10240], # alloc 12 + [106, 1, 0, 13, 3000, 8000, 10240], # alloc 13 + [205, 1, 0, 12, -5000, 3000, 10240], # free 12 + [401, 1, 0, 14, 1024, 4024, 10240], # alloc 14 + [499, 1, 0, 15, 4, 4028, 10240], # alloc 15 + [501, 1, 0, 13, -3000, 1028, 10240], # free 13 + [502, 1, 0, 15, -4, 1024, 10240], # free 15 + [906, 1, 0, 14, -1024, 0, 10240], # free 14 + ] + + self.all_events = sorted(self.event_data_cpu + self.event_data_gpu, key=lambda e: e[0]) + + def entry(self, ts, dev, dev_id, addr, alloc_size, total_allocated, total_reserved): + return { + 'ph': 'i', 's': 't', 'name': '[memory]', 'pid': 0, 'tid': 0, 'ts': ts, + 'args': { + 'Device Type': dev, + 'Device Id': dev_id, + 'Addr': addr, + 'Bytes': alloc_size, + 'Total Allocated': total_allocated, + 'Total Reserved': total_reserved, + }, + } + + def test_memory_curve_no_step_plot(self): + json_content = json.dumps([self.entry(*data) for data in self.all_events]) + + profile = 
parse_json_trace(json_content) + profile.process() + result = RunProfile.get_memory_curve(profile, time_metric='us', memory_metric='B', patch_for_step_plot=False) + + start_ts = profile.profiler_start_ts + self.assertEqual(1, start_ts) + + curves = result['rows'] + + self.assertIn('CPU', curves) + self.assertIn('GPU0', curves) + + self.assertEqual(len(self.event_data_cpu), len(curves['CPU'])) + for i in range(len(self.event_data_cpu)): + # adjusted timestamp + self.assertEqual(self.event_data_cpu[i][0] - start_ts, curves['CPU'][i][0]) + # total allocated + self.assertEqual(self.event_data_cpu[i][-2], curves['CPU'][i][1]) + # total reserved + self.assertEqual(self.event_data_cpu[i][-1], curves['CPU'][i][2]) + + self.assertEqual(len(self.event_data_gpu), len(curves['GPU0'])) + for i in range(len(self.event_data_gpu)): + self.assertEqual(self.event_data_gpu[i][0] - start_ts, curves['GPU0'][i][0]) + self.assertEqual(self.event_data_gpu[i][-2], curves['GPU0'][i][1]) + self.assertEqual(self.event_data_gpu[i][-1], curves['GPU0'][i][2]) + + def test_memory_curve_step_plot(self): + json_content = json.dumps([self.entry(*data) for data in self.all_events]) + + profile = parse_json_trace(json_content) + profile.process() + result = RunProfile.get_memory_curve(profile, time_metric='us', memory_metric='B', patch_for_step_plot=True) + + start_ts = profile.profiler_start_ts + self.assertEqual(1, start_ts) + + curves = result['rows'] + + self.assertIn('CPU', curves) + self.assertIn('GPU0', curves) + + self.assertEqual(2 * len(self.event_data_cpu) - 1, len(curves['CPU'])) + for i in range(len(curves['CPU'])): + if i % 2 == 0: # original values + # adjusted timestamp + self.assertEqual(self.event_data_cpu[i//2][0] - start_ts, curves['CPU'][i][0]) + # total allocated + self.assertEqual(self.event_data_cpu[i//2][-2], curves['CPU'][i][1]) + # total reserved + self.assertEqual(self.event_data_cpu[i//2][-1], curves['CPU'][i][2]) + else: # interpolated values + self.assertEqual(self.event_data_cpu[i//2+1][0] - start_ts, curves['CPU'][i][0]) + self.assertEqual(self.event_data_cpu[i//2][-2], curves['CPU'][i][1]) + self.assertEqual(self.event_data_cpu[i//2][-1], curves['CPU'][i][2]) + + self.assertEqual(2 * len(self.event_data_gpu) - 1, len(curves['GPU0'])) + for i in range(len(curves['GPU0'])): + if i % 2 == 0: # original values + self.assertEqual(self.event_data_gpu[i//2][0] - start_ts, curves['GPU0'][i][0]) + self.assertEqual(self.event_data_gpu[i//2][-2], curves['GPU0'][i][1]) + self.assertEqual(self.event_data_gpu[i//2][-1], curves['GPU0'][i][2]) + else: # interpolated values + self.assertEqual(self.event_data_gpu[i//2+1][0] - start_ts, curves['GPU0'][i][0]) + self.assertEqual(self.event_data_gpu[i//2][-2], curves['GPU0'][i][1]) + self.assertEqual(self.event_data_gpu[i//2][-1], curves['GPU0'][i][2]) + + +class TestModuleView(unittest.TestCase): + + def test_build_module_hierarchy(self): + from torch_tb_profiler.profiler import trace + from torch_tb_profiler.profiler.module_op import ( + _build_module_hierarchy, aggegate_module_view) + + json_content = """[ + { + "ph": "X", + "cat": "python_function", + "name": "test_root", + "pid": 1908, + "tid": 1908, + "ts": 1, + "dur": 19367, + "args": { + "External id": 0, + "Trace name": "PyTorch Profiler", + "Trace iteration": 0, + "Python id": 1, + "Python thread": 0 + } + }, + { + "ph": "X", + "cat": "python_function", + "name": "nn.Module: MyModule", + "pid": 1908, + "tid": 1908, + "ts": 2, + "dur": 211, + "args": { + "External id": 0, + "Trace name": "PyTorch 
Profiler", + "Trace iteration": 0, + "Python id": 2, + "Python parent id": 1, + "Python module id": 0 + } + }, + { + "ph": "X", + "cat": "python_function", + "name": "nn.Module: Linear", + "pid": 1908, + "tid": 1908, + "ts": 5, + "dur": 62, + "args": { + "External id": 0, + "Trace name": "PyTorch Profiler", + "Trace iteration": 0, + "Python id": 3, + "Python parent id": 2, + "Python thread": 0, + "Python module id": 1 + } + }, + { + "ph": "X", + "cat": "cpu_op", + "name": "aten::addmm", + "pid": 1908, + "tid": 1908, + "ts": 10, + "dur": 31, + "args": { + "External id": 12182, + "Trace name": "PyTorch Profiler", + "Trace iteration": 0, + "Fwd thread id": 0, + "Sequence number": 4006, + "python_caller_id": 3 + } + }, + { + "ph": "X", + "cat": "python_function", + "name": "nn.Module: MyModule", + "pid": 1908, + "tid": 1908, + "ts": 1000, + "dur": 211, + "args": { + "External id": 0, + "Trace name": "PyTorch Profiler", + "Trace iteration": 0, + "Python id": 4, + "Python parent id": 1, + "Python module id": 0 + } + }, + { + "ph": "X", + "cat": "python_function", + "name": "nn.Module: Linear", + "pid": 1908, + "tid": 1908, + "ts": 1001, + "dur": 62, + "args": { + "External id": 0, + "Trace name": "PyTorch Profiler", + "Trace iteration": 0, + "Python id": 5, + "Python parent id": 4, + "Python thread": 0, + "Python module id": 1 + } + }, + { + "ph": "X", + "cat": "cpu_op", + "name": "aten::addmm", + "pid": 1908, + "tid": 1908, + "ts": 1002, + "dur": 32, + "args": { + "External id": 12182, + "Trace name": "PyTorch Profiler", + "Trace iteration": 0, + "Fwd thread id": 0, + "Sequence number": 4006, + "python_caller_id": 5 + } + }, + { + "ph": "X", + "cat": "python_function", + "name": "nn.Module: MyModule", + "pid": 1908, + "tid": 1908, + "ts": 2000, + "dur": 211, + "args": { + "External id": 0, + "Trace name": "PyTorch Profiler", + "Trace iteration": 0, + "Python id": 6, + "Python parent id": 1, + "Python module id": 0 + } + }, + { + "ph": "X", + "cat": "python_function", + "name": "nn.Module: Linear", + "pid": 1908, + "tid": 1908, + "ts": 2001, + "dur": 62, + "args": { + "External id": 0, + "Trace name": "PyTorch Profiler", + "Trace iteration": 0, + "Python id": 7, + "Python parent id": 6, + "Python thread": 0, + "Python module id": 1 + } + }, + { + "ph": "X", + "cat": "cpu_op", + "name": "aten::addmm", + "pid": 1908, + "tid": 1908, + "ts": 2002, + "dur": 33, + "args": { + "External id": 12182, + "Trace name": "PyTorch Profiler", + "Trace iteration": 0, + "Fwd thread id": 0, + "Sequence number": 4006, + "python_caller_id": 7 + } + }, + { + "ph": "X", + "cat": "python_function", + "name": "nn.Module: Conv2", + "pid": 1908, + "tid": 1908, + "ts": 3000, + "dur": 211, + "args": { + "External id": 0, + "Trace name": "PyTorch Profiler", + "Trace iteration": 0, + "Python id": 8, + "Python parent id": 1, + "Python module id": 100 + } + } + ] + """ + data = parse_json_trace(json_content) + stats = aggegate_module_view(data.tid2tree, data.events) + stats.sort(key=lambda x: x.name) + self.assertEqual(2, len(stats)) + self.assertEqual('Conv2', stats[0].name) + self.assertEqual('MyModule', stats[1].name) + self.assertEqual(1, len(stats[1].children)) + self.assertEqual('Linear', stats[1].children[0].name) + + content = json.loads(json_content) + + events = [] + for data in content: + event = trace.create_event(data, False) + events.append(event) + + roots = _build_module_hierarchy(events) + roots.sort(key=lambda x: x.name) + self.assertEqual(2, len(roots)) + self.assertEqual('nn.Module: Conv2', roots[0].name) + 
self.assertEqual('nn.Module: MyModule', roots[1].name) + self.assertEqual(1, len(roots[1].children)) + self.assertEqual('nn.Module: Linear', roots[1].children[0].name) + + +class TestDataPipe(unittest.TestCase): + + def test_datapipe(self): + json_content = """[ + { + "ph": "X", "cat": "cpu_op", + "name": "enumerate(DataPipe)#ShufflerIterDataPipe", "pid": 7557, "tid": 7557, + "ts": 100, "dur": 23, + "args": { + "External id": 34, + "Trace name": "PyTorch Profiler", "Trace iteration": 0 + } + } + ]""" + profile = parse_json_trace(json_content) + profile.process() + + dataloader_ranges = profile.role_ranges[ProfileRole.DataLoader] + datapipe_range = None + for range in dataloader_ranges: + if range[0] == 100 and range[1] == 123: + datapipe_range = range + break + self.assertTrue(datapipe_range is not None) + + root = next(iter(profile.tid2tree.values())) + ops, _ = root.get_operator_and_kernels() + datapipe_op = None + for op in ops: + if op.name.startswith('enumerate(DataPipe)'): + datapipe_op = op + break + + self.assertTrue(datapipe_op is None) + + +if __name__ == '__main__': + unittest.main() diff --git a/tb_plugins/profiling/tb_plugin/test/test_ranges.py b/tb_plugins/profiling/tb_plugin/test/test_ranges.py new file mode 100644 index 0000000000000000000000000000000000000000..d40fc3e02ce3dac96f4b87ec9336dc5c1bb37b36 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/test/test_ranges.py @@ -0,0 +1,50 @@ +import unittest +import math + +from torch_tb_profiler.profiler.overall_parser import ( + merge_ranges, subtract_ranges_lists, intersection_ranges_lists, get_ranges_sum +) + + +def check_ranges_equal(ranges1, ranges2): + if len(ranges1) != len(ranges2): + return False + for i in range(len(ranges1)): + if ranges1[i][0] != ranges2[i][0] or ranges1[i][1] != ranges2[i][1]: + return False + return True + + +class TestOverallParser(unittest.TestCase): + def test_merge_ranges(self): + src_ranges = [(1.1, 2.2), (1.5, 2.3), (3.3, 3.9), (3.5, 3.6), (3.7, 3.8), (4.1, 4.2)] + expected_ranges = [(1.1, 2.3), (3.3, 3.9), (4.1, 4.2)] + dst_ranges = merge_ranges(src_ranges, True) + is_equal = check_ranges_equal(dst_ranges, expected_ranges) + self.assertTrue(is_equal) + + def test_subtract_ranges_lists(self): + ranges1 = [(1.1, 2.2), (3.3, 4.4), (5.5, 6.6)] + ranges2 = [(0, 0.1), (1.0, 1.4), (1.5, 1.6), (1.9, 3.4), (4.3, 4.6)] + expected_ranges = [(1.4, 1.5), (1.6, 1.9), (3.4, 4.3), (5.5, 6.6)] + dst_ranges = subtract_ranges_lists(ranges1, ranges2) + is_equal = check_ranges_equal(dst_ranges, expected_ranges) + self.assertTrue(is_equal) + + def test_intersection_ranges_lists(self): + ranges1 = [(1.1, 2.2), (3.3, 4.4), (5.5, 6.6)] + ranges2 = [(0, 0.1), (1.0, 1.4), (1.5, 1.6), (1.9, 3.4), (4.3, 4.6)] + expected_ranges = [(1.1, 1.4), (1.5, 1.6), (1.9, 2.2), (3.3, 3.4), (4.3, 4.4)] + dst_ranges = intersection_ranges_lists(ranges1, ranges2) + is_equal = check_ranges_equal(dst_ranges, expected_ranges) + self.assertTrue(is_equal) + + def test_get_ranges_sum(self): + ranges = [(1.1, 2.2), (3.3, 4.4), (5.5, 6.6)] + expected_sum = 3.3 + dst_sum = get_ranges_sum(ranges) + self.assertTrue(math.isclose(dst_sum, expected_sum)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tb_plugins/profiling/tb_plugin/test/test_tensorboard_end2end.py b/tb_plugins/profiling/tb_plugin/test/test_tensorboard_end2end.py new file mode 100644 index 0000000000000000000000000000000000000000..fae95b49050537b921e291a4771c63a6bff35690 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/test/test_tensorboard_end2end.py @@ -0,0 +1,170 
@@ +import json +import os +import random +import shutil +import socket +import tempfile +import time +import unittest +import urllib +import urllib.request +from subprocess import Popen +from urllib.error import HTTPError + + +def get_samples_dir(): + return os.path.join(os.path.dirname(os.path.abspath(__file__)), '../samples') + + +class TestEnd2End(unittest.TestCase): + + # def test_tensorboard_gs(self): + # test_folder = 'gs://pe-tests-public/tb_samples/' + # expected_runs = b'["resnet50_profiler_api_num_workers_0", "resnet50_profiler_api_num_workers_4"]' + # self._test_tensorboard_with_arguments(test_folder, expected_runs, {'TORCH_PROFILER_START_METHOD':'spawn'}) + + def test_tensorboard_end2end(self): + test_folder = get_samples_dir() + expected_runs = b'["resnet50_num_workers_0", "resnet50_num_workers_4"]' + + print('starting spawn mode testing...') + self._test_tensorboard_with_arguments(test_folder, expected_runs, {'TORCH_PROFILER_START_METHOD': 'spawn'}) + + @unittest.skip('fork is not used anymore') + def test_tensorboard_fork(self): + test_folder = get_samples_dir() + expected_runs = b'["resnet50_num_workers_0", "resnet50_num_workers_4"]' + + print('starting fork mode testing') + self._test_tensorboard_with_arguments(test_folder, expected_runs) + + def test_tensorboard_with_path_prefix(self): + test_folder = get_samples_dir() + expected_runs = b'["resnet50_num_workers_0", "resnet50_num_workers_4"]' + self._test_tensorboard_with_arguments(test_folder, expected_runs, path_prefix='/tensorboard/viewer/') + + def test_tensorboard_with_symlinks(self): + logdir = tempfile.mkdtemp(prefix='tensorboard_logdir') + + samples_dir = get_samples_dir() + + # Create the following layout, with 1 symlink to a run dir, and 1 regular run dir: + # logdir/ + # run_concrete/ + # run_symlink/ --> path/to/samples/resnet50_num_workers_4/ + shutil.copytree(os.path.join(samples_dir, 'resnet50_num_workers_0'), os.path.join(logdir, 'run_concrete')) + os.symlink(os.path.join(samples_dir, 'resnet50_num_workers_4'), os.path.join(logdir, 'run_symlink')) + + expected_runs = b'["run_concrete", "run_symlink"]' + self._test_tensorboard_with_arguments(logdir, expected_runs) + + shutil.rmtree(logdir) + + def _test_tensorboard_with_arguments(self, test_folder, expected_runs, env=None, path_prefix=None): + host = 'localhost' + port = random.randint(6008, 65535) + + try: + if env: + env_copy = os.environ.copy() + env_copy.update(env) + env = env_copy + if not path_prefix: + tb = Popen(['tensorboard', '--logdir='+test_folder, '--port='+str(port)], env=env) + else: + tb = Popen(['tensorboard', '--logdir='+test_folder, '--port='+str(port), + '--path_prefix='+path_prefix], env=env) + self._test_tensorboard(host, port, expected_runs, path_prefix) + finally: + pid = tb.pid + print('tensorboard process {} is terminating.'.format(pid)) + tb.terminate() + + def _test_tensorboard(self, host, port, expected_runs, path_prefix): + if not path_prefix: + link_prefix = 'http://{}:{}/data/plugin/pytorch_profiler/'.format(host, port) + else: + path_prefix = path_prefix.strip('/') + link_prefix = 'http://{}:{}/{}/data/plugin/pytorch_profiler/'.format(host, port, path_prefix) + run_link = link_prefix + 'runs' + + expected_links_format = [ + link_prefix + 'overview?run={}&worker=worker0&span=1&view=Overview', + link_prefix + 'operation?run={}&worker=worker0&span=1&view=Operator&group_by=Operation', + link_prefix + 'operation/table?run={}&worker=worker0&span=1&view=Operator&group_by=Operation', + link_prefix + 
'kernel/table?run={}&worker=worker0&span=1&view=Kernel&group_by=Kernel', + link_prefix + 'kernel?run={}&worker=worker0&span=1&view=Kernel&group_by=Kernel' + ] + + retry_times = 60 + while True: + try: + socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port)) + print('tensorboard started successfully') + break + except socket.error: + time.sleep(2) + retry_times -= 1 + if retry_times < 0: + self.fail('tensorboard start timeout') + continue + + retry_times = 60 + + while True: + try: + response = urllib.request.urlopen(run_link) + data = response.read() + runs = None + if data: + data = json.loads(data) + runs = data.get('runs') + if runs: + runs = '[{}]'.format(', '.join(['"{}"'.format(i) for i in runs])) + runs = runs.encode('utf-8') + if runs == expected_runs: + break + if retry_times % 10 == 0: + print('received mismatched data, retrying', data) + time.sleep(2) + retry_times -= 1 + if retry_times < 0: + self.fail('Load run timeout') + except Exception: + if retry_times > 0: + continue + else: + raise + + links = [] + for run in json.loads(expected_runs): + for expected_link in expected_links_format: + links.append(expected_link.format(run)) + + if os.environ.get('TORCH_PROFILER_REGEN_RESULT_CHECK') == '1': + with open('result_check_file.txt', 'w', encoding='utf-8') as f: + # NOTE: result_check_file.txt is manually generated and verified, + # and then checked in, so that changes in frontend content + # can be detected on code change. + for link in links: + response = urllib.request.urlopen(link) + f.write(response.read().decode('utf-8')) + f.write('\n') + else: + with open('result_check_file.txt', 'r') as f: + lines = f.readlines() + i = 0 + print('starting testing...') + for link in links: + try: + response = urllib.request.urlopen(link) + self.assertEqual(response.read(), lines[i].strip().encode(encoding='utf-8')) + i = i + 1 + except HTTPError as e: + self.fail(e) + self.assertEqual(i, 10) + print('ending testing...') + + +if __name__ == '__main__': + unittest.main() diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/__init__.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c94e71bf05f670cdbc1121ba1cec31150f68792b --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/__init__.py @@ -0,0 +1,7 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# -------------------------------------------------------------------------- + +# Entry point for the PyTorch TensorBoard plugin package. + +__version__ = '0.4.0' diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py new file mode 100644 index 0000000000000000000000000000000000000000..df881b05aba5400fbdead9b464a4951e99ea854a --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py @@ -0,0 +1,74 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# -------------------------------------------------------------------------- +import re +from collections import namedtuple + +PLUGIN_NAME = 'pytorch_profiler' + +WORKER_PATTERN = re.compile(r"""^(.*?) # worker name + (\.\d+)? 
# optional timestamp like 1619499959628 used as span name + \.pt\.trace\.json # the ending suffix + (?:\.gz)?$""", re.X) # optional .gz extension + +NODE_PROCESS_PATTERN = re.compile(r"""^(.*)_(\d+)""") +MONITOR_RUN_REFRESH_INTERNAL_IN_SECONDS = 10 +MAX_GPU_PER_NODE = 64 + +View = namedtuple('View', 'id, name, display_name') +OVERALL_VIEW = View(1, 'overall', 'Overview') +OP_VIEW = View(2, 'operator', 'Operator') +KERNEL_VIEW = View(3, 'kernel', 'Kernel') +TRACE_VIEW = View(4, 'trace', 'Trace') +DISTRIBUTED_VIEW = View(5, 'distributed', 'Distributed') +MEMORY_VIEW = View(6, 'memory', 'Memory') +MODULE_VIEW = View(7, 'module', 'Module') +LIGHTNING_VIEW = View(8, 'lightning', 'Lightning') + +TOOLTIP_GPU_UTIL = \ + 'GPU Utilization:\n' \ + 'GPU busy time / All steps time. The higher, the better. ' \ + 'GPU busy time is the time during which there is at least one GPU kernel running on it. ' \ + 'All steps time is the total time of all profiler steps (also called iterations).\n' +TOOLTIP_SM_EFFICIENCY = \ + 'Est. SM Efficiency:\n' \ + 'Estimated Stream Multiprocessor Efficiency. The higher, the better. ' \ + 'For a single kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). ' \ + "This overall number is the sum of all kernels' SM_Eff_K, weighted by each kernel's execution duration, " \ + 'divided by all steps time.\n' +TOOLTIP_OCCUPANCY_COMMON = \ + 'Est. Achieved Occupancy:\n' \ + 'For most cases such as memory bandwidth bounded kernels, the higher the better. ' \ + 'Occupancy is the ratio of active warps on an SM ' \ + 'to the maximum number of active warps supported by the SM. ' \ + 'The theoretical occupancy of a kernel is the upper limit of its occupancy, ' \ + 'limited by multiple factors such as kernel shape, the resources the kernel uses, ' \ + 'and the GPU compute capability.\n' \ + 'Est. Achieved Occupancy of a kernel, OCC_K = ' \ + 'min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). ' +TOOLTIP_OCCUPANCY_OVERVIEW = \ + "This overall number is the weighted average of all kernels' OCC_K " \ + "using each kernel's execution duration as weight. " \ + 'It shows fine-grained low-level GPU utilization.\n' +TOOLTIP_TENSOR_CORES = \ + 'Kernel using Tensor Cores:\n' \ + 'Total GPU Time for Tensor Core kernels / Total GPU Time for all kernels.\n' +TOOLTIP_OCCUPANCY_TABLE = \ + "This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, " \ + "using each call's execution duration as weight. " \ + 'It shows fine-grained low-level GPU utilization.' +TOOLTIP_BLOCKS_PER_SM = \ + 'Blocks Per SM = blocks of this kernel / SM number of this GPU.\n' \ + 'If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n' \ + '\"Mean Blocks per SM\" is the weighted average over all calls of this kernel, ' \ + "using each call's execution duration as weight." +TOOLTIP_OP_TC_ELIGIBLE = \ + 'Whether this operator is eligible to use Tensor Cores.' +TOOLTIP_OP_TC_SELF = \ + 'Time of self-kernels with Tensor Cores / Time of self-kernels.' +TOOLTIP_OP_TC_TOTAL = \ + 'Time of kernels with Tensor Cores / Time of kernels.' +TOOLTIP_KERNEL_USES_TC = \ + 'Whether this kernel uses Tensor Cores.' +TOOLTIP_KERNEL_OP_TC_ELIGIBLE = \ + 'Whether the operator that launched this kernel is eligible to use Tensor Cores.' 
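For reference, a minimal standalone sketch (not part of the diff) of how the WORKER_PATTERN above is expected to split a trace filename into a worker name and an optional span timestamp; the sample filenames here are made up, and the pattern is copied verbatim from consts.py:

    import re

    # Same pattern as WORKER_PATTERN in consts.py.
    pattern = re.compile(r"""^(.*?)             # worker name
                             (\.\d+)?           # optional timestamp used as span name
                             \.pt\.trace\.json  # the ending suffix
                             (?:\.gz)?$""", re.X)

    m = pattern.match('worker0.1619499959628.pt.trace.json.gz')
    assert m.group(1) == 'worker0' and m.group(2) == '.1619499959628'
    assert pattern.match('worker0.pt.trace.json').group(2) is None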
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/__init__.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a3807634fdd0c927a25153a01d700088a14fa68d --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/__init__.py @@ -0,0 +1,4 @@ +from .cache import Cache +from .file import (BaseFileSystem, StatData, abspath, basename, download_file, + exists, get_filesystem, glob, isdir, join, listdir, + makedirs, read, register_filesystem, relpath, walk) diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/azureblob.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/azureblob.py new file mode 100644 index 0000000000000000000000000000000000000000..b0ac49a655fd3d999ea80dfc3e6fa62e33fc5269 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/azureblob.py @@ -0,0 +1,187 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# ------------------------------------------------------------------------- +import os + +try: + from azure.storage.blob import ContainerClient +except ImportError: + # Keep the module importable so the guard in __init__ can raise a clear error. + ContainerClient = None + +from .. import utils +from .base import BaseFileSystem, RemotePath, StatData +from .utils import as_bytes, as_text, parse_blob_url + +logger = utils.get_logger() + + +class AzureBlobSystem(RemotePath, BaseFileSystem): + """Provides filesystem access to Azure Blob Storage.""" + + def __init__(self): + if not ContainerClient: + raise ImportError('azure-storage-blob must be installed for Azure Blob support.') + self.connection_string = os.environ.get('AZURE_STORAGE_CONNECTION_STRING', None) + + def exists(self, dirname): + """Returns whether the path exists.""" + basename, parts = self.split_blob_path(dirname) + if basename is None or parts is None: + return False + if basename == '': + # root container case + return True + else: + return basename == parts[0] + + def read(self, filename, binary_mode=False, size=None, continue_from=None): + """Reads contents of a file to a string.""" + logger.info('azure blob: starting reading file %s' % filename) + account, container, path = self.container_and_path(filename) + client = self.create_container_client(account, container) + blob_client = client.get_blob_client(path) + if not blob_client.exists(): + raise FileNotFoundError("file %s doesn't exist!" 
% path) + + downloader = blob_client.download_blob(offset=continue_from, length=size) + if continue_from is not None: + continuation_token = continue_from + downloader.size + else: + continuation_token = downloader.size + + data = downloader.readall() + logger.info('azure blob: file %s download is done, size is %d' % (filename, len(data))) + if binary_mode: + return as_bytes(data), continuation_token + else: + return as_text(data), continuation_token + + def write(self, filename, file_content, binary_mode=False): + """Writes string file contents to a file.""" + account, container, path = self.container_and_path(filename) + client = self.create_container_client(account, container) + + if binary_mode: + if not isinstance(file_content, bytes): + raise TypeError('File content type must be bytes') + else: + file_content = as_bytes(file_content) + client.upload_blob(path, file_content) + + def download_file(self, file_to_download, file_to_save): + logger.info('azure blob: starting downloading file %s as %s' % (file_to_download, file_to_save)) + account, container, path = self.container_and_path(file_to_download) + client = self.create_container_client(account, container) + blob_client = client.get_blob_client(path) + if not blob_client.exists(): + raise FileNotFoundError("file %s doesn't exist!" % path) + + downloader = blob_client.download_blob() + with open(file_to_save, 'wb') as downloaded_file: + data = downloader.readall() + downloaded_file.write(data) + logger.info('azure blob: file %s is downloaded as %s, size is %d' % + (file_to_download, file_to_save, len(data))) + + def glob(self, filename): + """Returns a list of files that match the given pattern(s).""" + # Only supports a prefix ending with '*' and no '?' in the string + star_i = filename.find('*') + quest_i = filename.find('?') + if quest_i >= 0: + raise NotImplementedError( + '{} not supported by compat glob'.format(filename) + ) + if star_i != len(filename) - 1: + return [] + + filename = filename[:-1] + + account, container, path = self.container_and_path(filename) + client = self.create_container_client(account, container) + blobs = client.list_blobs(name_starts_with=path) + return [blob.name for blob in blobs] + + def isdir(self, dirname): + """Returns whether the path is a directory or not.""" + basename, parts = self.split_blob_path(dirname) + if basename is None or parts is None: + return False + if basename == '': + # root container case + return True + else: + return basename == parts[0] and len(parts) > 1 + + def listdir(self, dirname): + """Returns a list of entries contained within a directory.""" + account, container, path = self.container_and_path(dirname) + client = self.create_container_client(account, container) + blob_iter = client.list_blobs(name_starts_with=path) + items = [] + for blob in blob_iter: + item = self.relpath(blob.name, path) + if item not in items: + items.append(item) + return items + + def makedirs(self, dirname): + """No need to create directories, since uploading a blob creates its path automatically.""" + pass + + def stat(self, filename): + """Returns file statistics for a given path.""" + account, container, path = self.container_and_path(filename) + client = self.create_container_client(account, container) + blob_client = client.get_blob_client(path) + props = blob_client.get_blob_properties() + return StatData(props.size) + + def walk(self, top, topdown=True, onerror=None): + account, container, path = self.container_and_path(top) + client = self.create_container_client(account, container) + blobs = 
+    def walk(self, top, topdown=True, onerror=None):
+        account, container, path = self.container_and_path(top)
+        client = self.create_container_client(account, container)
+        blobs = client.list_blobs(name_starts_with=path)
+        results = {}
+        for blob in blobs:
+            dirname, basename = self.split(blob.name)
+            dirname = 'https://{}/{}/{}'.format(account, container, dirname)
+            results.setdefault(dirname, []).append(basename)
+        for key, value in results.items():
+            yield key, None, value
+
+    def split_blob_path(self, blob_path):
+        """Find the first blob whose name starts with blob_path, take its path relative to
+        dirname(blob_path), and split that relative path.
+        return (basename(blob_path), [split path components])
+        If blob_path doesn't exist, return (None, None)
+        For example,
+        For blob https://trainingdaemon.blob.core.windows.net/tests/test1/test2/test.txt
+        * If the blob_path is '', return ('', [test1, test2, test.txt])
+        * If the blob_path is test1, return (test1, [test2, test.txt])
+        * If the blob_path is test1/test2, return (test2, [test2, test.txt])
+        * If the blob_path is test1/test2/test.txt, return (test.txt, [test.txt])
+        """
+        account, container, path = self.container_and_path(blob_path)
+        client = self.create_container_client(account, container)
+        blobs = client.list_blobs(name_starts_with=path, maxresults=1)
+
+        for blob in blobs:
+            dir_path, basename = self.split(path)
+            if dir_path:
+                rel_path = blob.name[len(dir_path):]
+                parts = rel_path.lstrip('/').split('/')
+            else:
+                parts = blob.name.split('/')
+            return (basename, parts)
+        return (None, None)
+
+    def container_and_path(self, url):
+        """Split an Azure Blob URL into account, container and blob path."""
+        root, parts = parse_blob_url(url)
+        if len(parts) != 2:
+            raise ValueError('Invalid azure blob url %s' % url)
+        return root, parts[0], parts[1]
+
+    def create_container_client(self, account, container):
+        if self.connection_string:
+            client = ContainerClient.from_connection_string(self.connection_string, container)
+        else:
+            client = ContainerClient.from_container_url('https://{}/{}'.format(account, container))
+        return client
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/base.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6f0a2f92200a0db0db9ea1b25d0099e6f6043a2
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/base.py
@@ -0,0 +1,112 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# -------------------------------------------------------------------------
+import os
+from abc import ABC, abstractmethod
+from collections import namedtuple
+
+# Data returned from the Stat call.
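+# It intentionally carries only the file length in bytes; a typical call site
+# looks like `file_size = fs.stat(path).length`.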
+StatData = namedtuple('StatData', ['length']) + + +class BaseFileSystem(ABC): + def support_append(self): + return False + + def append(self, filename, file_content, binary_mode=False): + pass + + def download_file(self, file_to_download, file_to_save): + pass + + @abstractmethod + def exists(self, filename): + raise NotImplementedError + + @abstractmethod + def read(self, file, binary_mode=False, size=None, continue_from=None): + raise NotImplementedError + + @abstractmethod + def write(self, filename, file_content, binary_mode=False): + raise NotImplementedError + + @abstractmethod + def glob(self, filename): + raise NotImplementedError + + @abstractmethod + def isdir(self, dirname): + raise NotImplementedError + + @abstractmethod + def listdir(self, dirname): + raise NotImplementedError + + @abstractmethod + def makedirs(self, path): + raise NotImplementedError + + @abstractmethod + def stat(self, filename): + raise NotImplementedError + + +class BasePath(ABC): + @abstractmethod + def join(self, path, *paths): + pass + + @abstractmethod + def abspath(self, path): + pass + + @abstractmethod + def basename(self, path): + pass + + @abstractmethod + def relpath(self, path, start): + pass + + +class LocalPath(BasePath): + def abspath(self, path): + return os.path.abspath(os.path.expanduser(os.path.expandvars(path))) + + def basename(self, path): + return os.path.basename(path) + + def relpath(self, path, start): + return os.path.relpath(path, start) + + def join(self, path, *paths): + return os.path.join(path, *paths) + + +class RemotePath(BasePath): + def split(self, path): + """Split a pathname. Returns tuple '(head, tail)' where 'tail' is + everything after the final slash. Either part may be empty.""" + sep = '/' + i = path.rfind(sep) + 1 + head, tail = path[:i], path[i:] + head = head.rstrip(sep) + return (head, tail) + + def join(self, path, *paths): + """Join paths with a slash.""" + return '/'.join((path,) + paths) + + def abspath(self, path): + return path + + def basename(self, path): + return path.split('/')[-1] + + def relpath(self, path, start): + if not path.startswith(start): + return path + start = start.rstrip('/') + begin = len(start) + 1 # include the ending slash '/' + return path[begin:] diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/cache.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/cache.py new file mode 100644 index 0000000000000000000000000000000000000000..ea9afab669d79885227e8c0dd165721a73a124bb --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/cache.py @@ -0,0 +1,81 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# ------------------------------------------------------------------------- +import tempfile + +from .. import utils +from .. import multiprocessing as mp +from . import file +from .file import basename, is_local, download_file, read + +logger = utils.get_logger() + + +class Cache: + def __init__(self, cache_dir=None): + self._lock = mp.Lock() + self._manager = mp.Manager() + self._cache_dict = self._manager.dict() + self._cache_dir = cache_dir + + def __getstate__(self): + """The multiprocessing module can start one of three ways: spawn, fork, or forkserver. + The default mode is fork in Unix and spawn on Windows and macOS. + Therefore, the __getstate__ and __setstate__ are used to pickle/unpickle the state in spawn mode. 
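+        Note: the registered filesystems are returned alongside the instance state
+        (see the return value below), so a spawned worker process sees the same
+        filesystem registry as its parent.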
+ """ + data = self.__dict__.copy() + # remove the _manager to bypass the following pickle error + # TypeError: cannot pickle 'weakref' object + if hasattr(self, '_manager'): + del data['_manager'] + logger.debug('Cache.__getstate__: %s ' % data) + return data, file._REGISTERED_FILESYSTEMS + + def __setstate__(self, state): + """The default logging level in new process is warning. Only warning and error log can be written to + streams. + So, we need call use_absl_handler in the new process. + """ + from absl import logging + logging.use_absl_handler() + logger.debug('Cache.__setstate__ %s ' % (state,)) + data, file._REGISTERED_FILESYSTEMS = state + self.__dict__.update(data) + + def read(self, filename): + local_file = self.get_remote_cache(filename) + return read(local_file) + + @property + def cache_dir(self): + return self._cache_dir + + def get_remote_cache(self, filename): + """Try to get the local file in the cache. download it to local if it cannot be found in cache.""" + local_file = self.get_file(filename) + if local_file is None: + if is_local(filename): + return filename + else: + local_file = tempfile.NamedTemporaryFile( + 'w+t', suffix='.%s' % basename(filename), dir=self._cache_dir, delete=False) + local_file.close() + download_file(filename, local_file.name) + self.add_file(filename, local_file.name) + return local_file.name + + return local_file + + def get_file(self, filename): + return self._cache_dict.get(filename) + + def add_file(self, source_file, local_file): + with self._lock: + logger.debug('add local cache %s for file %s' % (local_file, source_file)) + self._cache_dict[source_file] = local_file + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._manager.__exit__(exc_type, exc_value, traceback) diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/file.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/file.py new file mode 100644 index 0000000000000000000000000000000000000000..97e4902f8fae6f614fcd6953b06e9af19cf0f529 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/file.py @@ -0,0 +1,622 @@ +""" +This file is forked from +https://github.com/tensorflow/tensorboard/blob/master/tensorboard/compat/tensorflow_stub/io/gfile.py. +The following functionalities are added after forking: +* Check Azure Blob & Google Cloud available or not +* get_filesystem changes to support Azure Blobs +* add BaseFileSystem and PathBase abstracted class for the filesystem. +* add download_file for each file system to cache the remote file to local temporary folder. +* add AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY for S3 file system which is not supported by tensorboard. +* add Azure blob file system +* add Google Cloud file system +* add specialized walk for Local file system, Azure Blob and Google Cloud to improve the walk performance. +* add global wrapper for abspath, basename, join, download_file. +* change the global walk wrapper to support specialized walk. +""" +import glob as py_glob +import os +import tempfile + +from .. 
import utils +from .base import BaseFileSystem, LocalPath, RemotePath, StatData +from .utils import as_bytes, as_text, parse_blob_url + +logger = utils.get_logger() + +try: + import boto3 + import botocore.exceptions + + S3_ENABLED = True +except ImportError: + S3_ENABLED = False + +try: + from azure.storage.blob import ContainerClient + BLOB_ENABLED = True +except ImportError: + BLOB_ENABLED = False + +try: + # Imports the Google Cloud client library + from google.cloud import storage + GS_ENABLED = True +except ImportError: + GS_ENABLED = False + + +_DEFAULT_BLOCK_SIZE = 16 * 1024 * 1024 + +# Registry of filesystems by prefix. +# +# Currently supports: +# * "s3://" URLs for S3 based on boto3 +# * "https://.blob.core.windows.net" for Azure Blob based on azure-storage-blob +# * "gs://" URLs for Google Cloud based on google-cloud-storage +# * Local filesystem when not match any prefix. +_REGISTERED_FILESYSTEMS = {} + + +def register_filesystem(prefix, filesystem): + if ":" in prefix: + raise ValueError("Filesystem prefix cannot contain a :") + _REGISTERED_FILESYSTEMS[prefix] = filesystem + + +def get_filesystem(filename): + """Return the registered filesystem for the given file.""" + prefix = "" + index = filename.find("://") + if index >= 0: + prefix = filename[:index] + if prefix.upper() in ('HTTP', 'HTTPS'): + root, _ = parse_blob_url(filename) + if root.lower().endswith('.blob.core.windows.net'): + fs = _REGISTERED_FILESYSTEMS.get('blob', None) + else: + raise ValueError("Not supported file system for prefix %s" % root) + else: + fs = _REGISTERED_FILESYSTEMS.get(prefix, None) + if fs is None: + raise ValueError("No recognized filesystem for prefix %s" % prefix) + return fs + + +class LocalFileSystem(LocalPath, BaseFileSystem): + def __init__(self): + pass + + def exists(self, filename): + return os.path.exists(filename) + + def read(self, filename, binary_mode=False, size=None, continue_from=None): + mode = "rb" if binary_mode else "r" + encoding = None if binary_mode else "utf8" + if not self.exists(filename): + raise FileNotFoundError(filename) + + offset = None + if continue_from is not None: + offset = continue_from.get("opaque_offset", None) + with open(filename, mode, encoding=encoding) as f: + if offset is not None: + f.seek(offset) + data = f.read(size) + # The new offset may not be `offset + len(data)`, due to decoding + # and newline translation. + # So, just measure it in whatever terms the underlying stream uses. + continuation_token = {"opaque_offset": f.tell()} + return (data, continuation_token) + + def write(self, filename, file_content, binary_mode=False): + """Writes string file contents to a file, overwriting any existing contents. + """ + self._write(filename, file_content, "wb" if binary_mode else "w") + + def support_append(self): + return True + + def append(self, filename, file_content, binary_mode=False): + """Append string file contents to a file. 
+ """ + self._write(filename, file_content, "ab" if binary_mode else "a") + + def _write(self, filename, file_content, mode): + encoding = None if "b" in mode else "utf8" + with open(filename, mode, encoding=encoding) as f: + compatify = as_bytes if "b" in mode else as_text + f.write(compatify(file_content)) + + def glob(self, filename): + """Returns a list of files that match the given pattern(s).""" + if isinstance(filename, str): + return [ + matching_filename + for matching_filename in py_glob.glob(filename) + ] + else: + return [ + matching_filename + for single_filename in filename + for matching_filename in py_glob.glob(single_filename) + ] + + def isdir(self, dirname): + return os.path.isdir(dirname) + + def listdir(self, dirname): + entries = os.listdir(dirname) + entries = [item for item in entries] + return entries + + def makedirs(self, path): + os.makedirs(path, exist_ok=True) + + def stat(self, filename): + """Returns file statistics for a given path.""" + # NOTE: Size of the file is given by .st_size as returned from + # os.stat(), but we convert to .length + file_length = os.stat(filename).st_size + return StatData(file_length) + + def walk(self, top, topdown=True, onerror=None): + # Note on followlinks=True: per the tensorboard documentation [1], users are encouraged to + # use symlink trees to have fine-grained control over the filesystem layout of runs. To + # support such trees, we must follow links. + # [1] https://github.com/tensorflow/tensorboard/blob/master/README.md#logdir--logdir_spec-legacy-mode + yield from os.walk(top, topdown, onerror, followlinks=True) + + +class S3FileSystem(RemotePath, BaseFileSystem): + """Provides filesystem access to S3.""" + + def __init__(self): + if not boto3: + raise ImportError("boto3 must be installed for S3 support.") + self._s3_endpoint = os.environ.get("S3_ENDPOINT", None) + access_key = os.environ.get("AWS_ACCESS_KEY_ID") + secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY") + if access_key and secret_key: + boto3.setup_default_session( + aws_access_key_id=access_key, aws_secret_access_key=secret_key) + + def bucket_and_path(self, url): + """Split an S3-prefixed URL into bucket and path.""" + if url.startswith("s3://"): + url = url[len("s3://"):] + idx = url.index("/") + bucket = url[:idx] + path = url[(idx + 1):] + return bucket, path + + def exists(self, filename): + """Determines whether a path exists or not.""" + client = boto3.client("s3", endpoint_url=self._s3_endpoint) + bucket, path = self.bucket_and_path(filename) + r = client.list_objects(Bucket=bucket, Prefix=path, Delimiter="/") + if r.get("Contents") or r.get("CommonPrefixes"): + return True + return False + + def read(self, filename, binary_mode=False, size=None, continue_from=None): + """Reads contents of a file to a string.""" + s3 = boto3.resource("s3", endpoint_url=self._s3_endpoint) + bucket, path = self.bucket_and_path(filename) + args = {} + + # S3 use continuation tokens of the form: {byte_offset: number} + offset = 0 + if continue_from is not None: + offset = continue_from.get("byte_offset", 0) + + endpoint = "" + if size is not None: + endpoint = offset + size + + if offset != 0 or endpoint != "": + args["Range"] = "bytes={}-{}".format(offset, endpoint) + + logger.info("s3: starting reading file %s" % filename) + try: + stream = s3.Object(bucket, path).get(**args)["Body"].read() + except botocore.exceptions.ClientError as exc: + if exc.response["Error"]["Code"] in ["416", "InvalidRange"]: + if size is not None: + # Asked for too much, so request just to 
the end. Do this + # in a second request so we don't check length in all cases. + client = boto3.client("s3", endpoint_url=self._s3_endpoint) + obj = client.head_object(Bucket=bucket, Key=path) + content_length = obj["ContentLength"] + endpoint = min(content_length, offset + size) + if offset == endpoint: + # Asked for no bytes, so just return empty + stream = b"" + else: + args["Range"] = "bytes={}-{}".format(offset, endpoint) + stream = s3.Object(bucket, path).get(**args)["Body"].read() + else: + raise + + logger.info("s3: file %s download is done, size is %d" % + (filename, len(stream))) + # `stream` should contain raw bytes here (i.e., there has been neither decoding nor newline translation), + # so the byte offset increases by the expected amount. + continuation_token = {"byte_offset": (offset + len(stream))} + if binary_mode: + return (bytes(stream), continuation_token) + else: + return (stream.decode("utf-8"), continuation_token) + + def write(self, filename, file_content, binary_mode=False): + """Writes string file contents to a file.""" + client = boto3.client("s3", endpoint_url=self._s3_endpoint) + bucket, path = self.bucket_and_path(filename) + if binary_mode: + if not isinstance(file_content, bytes): + raise TypeError("File content type must be bytes") + else: + file_content = as_bytes(file_content) + client.put_object(Body=file_content, Bucket=bucket, Key=path) + + def download_file(self, file_to_download, file_to_save): + logger.info("s3: starting downloading file %s as %s" % + (file_to_download, file_to_save)) + # Use boto3.resource instead of boto3.client('s3') to support minio. + # https://docs.min.io/docs/how-to-use-aws-sdk-for-python-with-minio-server.html + # To support minio, the S3_ENDPOINT need to be set like: S3_ENDPOINT=http://localhost:9000 + s3 = boto3.resource("s3", endpoint_url=self._s3_endpoint) + bucket, path = self.bucket_and_path(file_to_download) + s3.Bucket(bucket).download_file(path, file_to_save) + logger.info("s3: file %s is downloaded as %s" % (file_to_download, file_to_save)) + return + + def glob(self, filename): + """Returns a list of files that match the given pattern(s).""" + # Only support prefix with * at the end and no ? 
in the string + star_i = filename.find("*") + quest_i = filename.find("?") + if quest_i >= 0: + raise NotImplementedError("{} not supported".format(filename)) + if star_i != len(filename) - 1: + return [] + + filename = filename[:-1] + client = boto3.client("s3", endpoint_url=self._s3_endpoint) + bucket, path = self.bucket_and_path(filename) + p = client.get_paginator("list_objects") + keys = [] + for r in p.paginate(Bucket=bucket, Prefix=path): + for o in r.get("Contents", []): + key = o["Key"][len(path):] + if key: + keys.append(filename + key) + return keys + + def isdir(self, dirname): + """Returns whether the path is a directory or not.""" + client = boto3.client("s3", endpoint_url=self._s3_endpoint) + bucket, path = self.bucket_and_path(dirname) + if not path.endswith("/"): + path += "/" + r = client.list_objects(Bucket=bucket, Prefix=path, Delimiter="/") + if r.get("Contents") or r.get("CommonPrefixes"): + return True + return False + + def listdir(self, dirname): + """Returns a list of entries contained within a directory.""" + client = boto3.client("s3", endpoint_url=self._s3_endpoint) + bucket, path = self.bucket_and_path(dirname) + p = client.get_paginator("list_objects") + if not path.endswith("/"): + path += "/" + keys = [] + for r in p.paginate(Bucket=bucket, Prefix=path, Delimiter="/"): + keys.extend( + o["Prefix"][len(path): -1] for o in r.get("CommonPrefixes", []) + ) + for o in r.get("Contents", []): + key = o["Key"][len(path):] + if key: + keys.append(key) + return keys + + def makedirs(self, dirname): + """Creates a directory and all parent/intermediate directories.""" + if not self.exists(dirname): + client = boto3.client("s3", endpoint_url=self._s3_endpoint) + bucket, path = self.bucket_and_path(dirname) + if not path.endswith("/"): + path += "/" + client.put_object(Body="", Bucket=bucket, Key=path) + + def stat(self, filename): + """Returns file statistics for a given path.""" + # Size of the file is given by ContentLength from S3 + client = boto3.client("s3", endpoint_url=self._s3_endpoint) + bucket, path = self.bucket_and_path(filename) + + obj = client.head_object(Bucket=bucket, Key=path) + return StatData(obj["ContentLength"]) + + +register_filesystem("", LocalFileSystem()) +if S3_ENABLED: + register_filesystem("s3", S3FileSystem()) + +if BLOB_ENABLED: + from .azureblob import AzureBlobSystem + register_filesystem("blob", AzureBlobSystem()) + +if GS_ENABLED: + from .gs import GoogleBlobSystem + register_filesystem("gs", GoogleBlobSystem()) + + +class File(object): + def __init__(self, filename, mode): + if mode not in ("r", "rb", "br", "w", "wb", "bw"): + raise ValueError("mode {} not supported by File".format(mode)) + self.filename = filename + self.fs = get_filesystem(self.filename) + self.fs_supports_append = self.fs.support_append() + self.buff = None + self.buff_chunk_size = _DEFAULT_BLOCK_SIZE + self.buff_offset = 0 + self.continuation_token = None + self.write_temp = None + self.write_started = False + self.binary_mode = "b" in mode + self.write_mode = "w" in mode + self.closed = False + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + self.buff = None + self.buff_offset = 0 + self.continuation_token = None + + def __iter__(self): + return self + + def _read_buffer_to_offset(self, new_buff_offset): + old_buff_offset = self.buff_offset + read_size = min(len(self.buff), new_buff_offset) - old_buff_offset + self.buff_offset += read_size + return self.buff[old_buff_offset: old_buff_offset + read_size] + + def read(self, 
n=None): + """Reads contents of file to a string. + + Args: + n: int, number of bytes or characters to read, otherwise + read all the contents of the file + + Returns: + Subset of the contents of the file as a string or bytes. + """ + if self.write_mode: + raise OSError("File not opened in read mode") + + result = None + if self.buff and len(self.buff) > self.buff_offset: + # read from local buffer + if n is not None: + chunk = self._read_buffer_to_offset(self.buff_offset + n) + if len(chunk) == n: + return chunk + result = chunk + n -= len(chunk) + else: + # add all local buffer and update offsets + result = self._read_buffer_to_offset(len(self.buff)) + + # read from filesystem + read_size = max(self.buff_chunk_size, n) if n is not None else None + (self.buff, self.continuation_token) = self.fs.read( + self.filename, self.binary_mode, read_size, self.continuation_token) + self.buff_offset = 0 + + # add from filesystem + if n is not None: + chunk = self._read_buffer_to_offset(n) + else: + # add all local buffer and update offsets + chunk = self._read_buffer_to_offset(len(self.buff)) + result = result + chunk if result else chunk + + return result + + def write(self, file_content): + """Writes string file contents to file, clearing contents of the file + on first write and then appending on subsequent calls. + """ + if not self.write_mode: + raise OSError("File not opened in write mode") + + if self.closed: + raise OSError("File already closed") + + if self.fs_supports_append: + if not self.write_started: + # write the first chunk to truncate file if it already exists + self.fs.write(self.filename, file_content, self.binary_mode) + self.write_started = True + else: + # append the later chunks + self.fs.append(self.filename, file_content, self.binary_mode) + else: + # add to temp file, but wait for flush to write to final filesystem + if self.write_temp is None: + mode = "w+b" if self.binary_mode else "w+" + self.write_temp = tempfile.TemporaryFile(mode) + + compatify = as_bytes if self.binary_mode else as_text + self.write_temp.write(compatify(file_content)) + + def __next__(self): + line = None + while True: + if not self.buff: + # read one unit into the buffer + line = self.read(1) + if line and (line[-1] == "\n" or not self.buff): + return line + if not self.buff: + raise StopIteration() + else: + index = self.buff.find("\n", self.buff_offset) + if index != -1: + # include line until now plus newline + chunk = self.read(index + 1 - self.buff_offset) + line = line + chunk if line else chunk + return line + + # read one unit past end of buffer + chunk = self.read(len(self.buff) + 1 - self.buff_offset) + line = line + chunk if line else chunk + if line and (line[-1] == "\n" or not self.buff): + return line + if not self.buff: + raise StopIteration() + + def next(self): + return self.__next__() + + def flush(self): + if self.closed: + raise OSError("File already closed") + + if not self.fs_supports_append: + if self.write_temp is not None: + # read temp file from the beginning + self.write_temp.flush() + self.write_temp.seek(0) + chunk = self.write_temp.read() + if chunk is not None: + # write full contents and keep in temp file + self.fs.write(self.filename, chunk, self.binary_mode) + self.write_temp.seek(len(chunk)) + + def close(self): + self.flush() + if self.write_temp is not None: + self.write_temp.close() + self.write_temp = None + self.write_started = False + self.closed = True + + +def exists(filename): + """Determines whether a path exists or not.""" + return 
get_filesystem(filename).exists(filename) + + +def abspath(path): + return get_filesystem(path).abspath(path) + + +def basename(path): + return get_filesystem(path).basename(path) + + +def relpath(path, start): + return get_filesystem(path).relpath(path, start) + + +def join(path, *paths): + return get_filesystem(path).join(path, *paths) + + +def download_file(file_to_download, file_to_save): + """Downloads the file, returning a temporary path to the file after finishing.""" + get_filesystem(file_to_download).download_file(file_to_download, file_to_save) + + +def glob(filename): + """Returns a list of files that match the given pattern(s).""" + return get_filesystem(filename).glob(filename) + + +def is_local(path): + """Returns whether the path is a local path""" + return isinstance(get_filesystem(path), LocalFileSystem) + + +def isdir(dirname): + """Returns whether the path is a directory or not.""" + return get_filesystem(dirname).isdir(dirname) + + +def listdir(dirname): + """Returns a list of entries contained within a directory. + + The list is in arbitrary order. It does not contain the special entries "." + and "..". + """ + return get_filesystem(dirname).listdir(dirname) + + +def makedirs(path): + """Creates a directory and all parent/intermediate directories.""" + return get_filesystem(path).makedirs(path) + + +def walk(top, topdown=True, onerror=None): + """Recursive directory tree generator for directories. + + Args: + top: string, a Directory name + topdown: bool, Traverse pre order if True, post order if False. + onerror: optional handler for errors. Should be a function, it will be + called with the error as argument. Rethrowing the error aborts the walk. + + Errors that happen while listing directories are ignored. + + Yields: + Each yield is a 3-tuple: the pathname of a directory, followed by lists + of all its subdirectories and leaf files. + (dirname, [subdirname, subdirname, ...], [filename, filename, ...]) + as strings + """ + fs = get_filesystem(top) + if hasattr(fs, "walk"): + yield from fs.walk(top, topdown, onerror) + else: + top = fs.abspath(top) + listing = fs.listdir(top) + + files = [] + subdirs = [] + for item in listing: + full_path = fs.join(top, item) + if fs.isdir(full_path): + subdirs.append(item) + else: + files.append(item) + + here = (top, subdirs, files) + + if topdown: + yield here + + for subdir in subdirs: + joined_subdir = fs.join(top, subdir) + for subitem in walk(joined_subdir, topdown, onerror=onerror): + yield subitem + + if not topdown: + yield here + + +def stat(filename): + """Returns file statistics for a given path.""" + return get_filesystem(filename).stat(filename) + + +def read(file): + with File(file, 'rb') as f: + return f.read() diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/gs.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/gs.py new file mode 100644 index 0000000000000000000000000000000000000000..df1676f070c19db7a08a7a126d1df07e40ff9055 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/gs.py @@ -0,0 +1,127 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# ------------------------------------------------------------------------- +from google.cloud import storage + +from .. 
import utils
+from .base import BaseFileSystem, RemotePath, StatData
+
+logger = utils.get_logger()
+
+
+class GoogleBlobSystem(RemotePath, BaseFileSystem):
+    """Provides filesystem access to Google Cloud Storage."""
+
+    def __init__(self):
+        if not storage:
+            raise ImportError('google-cloud-storage must be installed for Google Cloud Blob support.')
+
+    def exists(self, dirname):
+        """Determines whether a path exists or not."""
+        bucket_name, path = self.bucket_and_path(dirname)
+        client = self.create_google_cloud_client()
+        bucket = client.bucket(bucket_name)
+        return bucket.blob(path).exists()
+
+    def read(self, filename, binary_mode=False, size=None, continue_from=None):
+        raise NotImplementedError
+
+    def write(self, filename, file_content, binary_mode=False):
+        raise NotImplementedError
+
+    def glob(self, filename):
+        raise NotImplementedError
+
+    def download_file(self, file_to_download, file_to_save):
+        bucket_name, path = self.bucket_and_path(file_to_download)
+        client = self.create_google_cloud_client()
+        bucket = client.bucket(bucket_name)
+        blob = bucket.blob(path)
+        blob.download_to_filename(file_to_save)
+
+    def isdir(self, dirname):
+        """Returns whether the path is a directory or not."""
+        basename, parts = self.split_blob_path(dirname)
+        if basename is None or parts is None:
+            return False
+        if basename == '':
+            # root container case
+            return True
+        else:
+            return basename == parts[0] and len(parts) > 1
+
+    def listdir(self, dirname):
+        """Returns a list of entries contained within a directory."""
+        bucket_name, path = self.bucket_and_path(dirname)
+        client = self.create_google_cloud_client()
+        blobs = client.list_blobs(bucket_name, prefix=path)
+        items = []
+        for blob in blobs:
+            item = self.relpath(blob.name, path)
+            if item not in items:
+                items.append(item)
+        return items
+
+    def makedirs(self, dirname):
+        """No need to create the directory explicitly: uploading a blob creates its path implicitly."""
+        pass
+
+    def stat(self, filename):
+        """Returns file statistics for a given path."""
+        bucket_name, path = self.bucket_and_path(filename)
+        client = self.create_google_cloud_client()
+        bucket = client.bucket(bucket_name)
+        blob = bucket.get_blob(path)
+        return StatData(blob.size)
+
+    def walk(self, top, topdown=True, onerror=None):
+        bucket_name, path = self.bucket_and_path(top)
+        client = self.create_google_cloud_client()
+        blobs = client.list_blobs(bucket_name, prefix=path)
+        results = {}
+        for blob in blobs:
+            dirname, basename = self.split(blob.name)
+            dirname = 'gs://{}/{}'.format(bucket_name, dirname)
+            results.setdefault(dirname, []).append(basename)
+        for key, value in results.items():
+            yield key, None, value
+
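+    # A sketch of what the prefix probe below returns, using the paths from the
+    # docstring example (bucket `tests` is hypothetical):
+    #
+    #   fs = GoogleBlobSystem()
+    #   fs.split_blob_path('gs://tests/test1')           # -> ('test1', ['test2', 'test.txt'])
+    #   fs.split_blob_path('gs://tests/no-such-prefix')  # -> (None, None)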
+    def split_blob_path(self, blob_path):
+        """Find the first blob whose name starts with blob_path, take its path relative to
+        dirname(blob_path), and split that relative path.
+        return (basename(blob_path), [split path components])
+        If blob_path doesn't exist, return (None, None)
+        For example,
+        For blob gs://tests/test1/test2/test.txt
+        * If the blob_path is '', return ('', [test1, test2, test.txt])
+        * If the blob_path is test1, return (test1, [test2, test.txt])
+        * If the blob_path is test1/test2, return (test2, [test2, test.txt])
+        * If the blob_path is test1/test2/test.txt, return (test.txt, [test.txt])
+        """
+        bucket_name, path = self.bucket_and_path(blob_path)
+        client = self.create_google_cloud_client()
+        blobs = client.list_blobs(bucket_name, prefix=path, delimiter=None, max_results=1)
+
+        for blob in blobs:
+            dir_path, basename = self.split(path)
+            if dir_path:
+                rel_path = blob.name[len(dir_path):]
+                parts = rel_path.lstrip('/').split('/')
+            else:
+                parts = blob.name.split('/')
+            return (basename, parts)
+        return (None, None)
+
+    def bucket_and_path(self, url):
+        """Split a gs://-prefixed URL into bucket and path."""
+        if url.startswith('gs://'):
+            url = url[len('gs://'):]
+        idx = url.index('/')
+        bucket = url[:idx]
+        path = url[(idx + 1):]
+        return bucket, path
+
+    def create_google_cloud_client(self):
+        # TODO: support client with credential?
+        client = storage.Client.create_anonymous_client()
+        return client
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/utils.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..79e9afc391bdc41785850e0ebb3522c97cc4ad53
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/io/utils.py
@@ -0,0 +1,72 @@
+def as_str_any(value):
+    """Converts to `str` as `str(value)`, but uses `as_str` for `bytes`.
+
+    Args:
+        value: An object that can be converted to `str`.
+
+    Returns:
+        A `str` object.
+    """
+    if isinstance(value, bytes):
+        return as_str(value)
+    else:
+        return str(value)
+
+
+def as_text(bytes_or_text, encoding="utf-8"):
+    """Returns the given argument as a unicode string.
+
+    Args:
+        bytes_or_text: A `bytes`, `str`, or `unicode` object.
+        encoding: A string indicating the charset for decoding unicode.
+
+    Returns:
+        A `str` object.
+
+    Raises:
+        TypeError: If `bytes_or_text` is not a binary or unicode string.
+    """
+    if isinstance(bytes_or_text, str):
+        return bytes_or_text
+    elif isinstance(bytes_or_text, bytes):
+        return bytes_or_text.decode(encoding)
+    else:
+        raise TypeError(
+            "Expected binary or unicode string, got %r" % bytes_or_text
+        )
+
+
+# Convert an object to a `str`; `as_str` is an alias of `as_text`.
as_str = as_text
+
+
+def as_bytes(bytes_or_text, encoding="utf-8"):
+    """Converts either bytes or unicode to `bytes`, using utf-8 encoding for
+    text.
+
+    Args:
+        bytes_or_text: A `bytes`, `str`, or `unicode` object.
+        encoding: A string indicating the charset for encoding unicode.
+
+    Returns:
+        A `bytes` object.
+
+    Raises:
+        TypeError: If `bytes_or_text` is not a binary or unicode string.
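+
+    For example, as_bytes('N/A') and as_bytes(b'N/A') both return b'N/A'.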
+ """ + if isinstance(bytes_or_text, str): + return bytes_or_text.encode(encoding) + elif isinstance(bytes_or_text, bytes): + return bytes_or_text + else: + raise TypeError( + "Expected binary or unicode string, got %r" % (bytes_or_text,) + ) + + +def parse_blob_url(url): + from urllib import parse + url_path = parse.urlparse(url) + + parts = url_path.path.lstrip('/').split('/', 1) + return url_path.netloc, tuple(parts) diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/multiprocessing.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/multiprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..b71773505c4473934340a0e573ebfcfe3db6f6a4 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/multiprocessing.py @@ -0,0 +1,13 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# ------------------------------------------------------------------------- +import multiprocessing as mp +import os + + +def get_start_method(): + return os.getenv('TORCH_PROFILER_START_METHOD', 'spawn') + + +__all__ = [x for x in dir(mp.get_context(get_start_method())) if not x.startswith('_')] +globals().update((name, getattr(mp.get_context(get_start_method()), name)) for name in __all__) diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..c93aedb88e0b43d50dbe71ee61f0ef4fdbf9134b --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py @@ -0,0 +1,557 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# -------------------------------------------------------------------------- +import atexit +import gzip +import json +import os +import shutil +import sys +import tempfile +import threading +import time +from collections import OrderedDict +from queue import Queue + +import werkzeug +from tensorboard.plugins import base_plugin +from werkzeug import exceptions, wrappers + +from . import consts, io, utils +from .profiler import RunLoader +from .run import DistributedRunProfile, Run, RunProfile + +logger = utils.get_logger() + + +def decorate_headers(func): + def wrapper(*args, **kwargs): + headers = func(*args, **kwargs) + headers.extend(TorchProfilerPlugin.headers) + return headers + return wrapper + + +exceptions.HTTPException.get_headers = decorate_headers(exceptions.HTTPException.get_headers) + + +class TorchProfilerPlugin(base_plugin.TBPlugin): + """TensorBoard plugin for Torch Profiler.""" + + plugin_name = consts.PLUGIN_NAME + headers = [('X-Content-Type-Options', 'nosniff')] + CONTENT_TYPE = 'application/json' + + def __init__(self, context: base_plugin.TBContext): + """Instantiates TorchProfilerPlugin. + Args: + context: A base_plugin.TBContext instance. 
+ """ + super(TorchProfilerPlugin, self).__init__(context) + self.logdir = io.abspath(context.logdir.rstrip('/')) + + self._load_lock = threading.Lock() + self._load_threads = [] + + self._runs = OrderedDict() + self._runs_lock = threading.Lock() + + self._temp_dir = tempfile.mkdtemp() + self._cache = io.Cache(self._temp_dir) + self._queue = Queue() + self._gpu_metrics_file_dict = {} + monitor_runs = threading.Thread(target=self._monitor_runs, name='monitor_runs', daemon=True) + monitor_runs.start() + + receive_runs = threading.Thread(target=self._receive_runs, name='receive_runs', daemon=True) + receive_runs.start() + + self.diff_run_cache = {} + self.diff_run_flatten_cache = {} + + def clean(): + logger.debug('starting cleanup...') + self._cache.__exit__(*sys.exc_info()) + logger.debug('remove temporary cache directory %s' % self._temp_dir) + shutil.rmtree(self._temp_dir) + + atexit.register(clean) + + def is_active(self): + """Returns whether there is relevant data for the plugin to process. + If there is no any pending run, hide the plugin + """ + if self.is_loading: + return True + else: + with self._runs_lock: + return bool(self._runs) + + def get_plugin_apps(self): + return { + '/index.js': self.static_file_route, + '/index.html': self.static_file_route, + '/trace_viewer_full.html': self.static_file_route, + '/trace_embedding.html': self.static_file_route, + '/runs': self.runs_route, + '/views': self.views_route, + '/workers': self.workers_route, + '/spans': self.spans_route, + '/overview': self.overview_route, + '/operation': self.operation_pie_route, + '/operation/table': self.operation_table_route, + '/operation/stack': self.operation_stack_route, + '/kernel': self.kernel_pie_route, + '/kernel/table': self.kernel_table_route, + '/kernel/tc_pie': self.kernel_tc_route, + '/trace': self.trace_route, + '/distributed/gpuinfo': self.dist_gpu_info_route, + '/distributed/overlap': self.comm_overlap_route, + '/distributed/waittime': self.comm_wait_route, + '/distributed/commops': self.comm_ops_route, + '/memory': self.memory_route, + '/memory_curve': self.memory_curve_route, + '/memory_events': self.memory_events_route, + '/module': self.module_route, + '/tree': self.op_tree_route, + '/diff': self.diff_run_route, + '/diffnode': self.diff_run_node_route, + } + + def frontend_metadata(self): + return base_plugin.FrontendMetadata(es_module_path='/index.js', disable_reload=True) + + @wrappers.Request.application + def runs_route(self, request: werkzeug.Request): + with self._runs_lock: + names = list(self._runs.keys()) + + data = { + 'runs': names, + 'loading': self.is_loading + } + return self.respond_as_json(data) + + @wrappers.Request.application + def views_route(self, request: werkzeug.Request): + name = request.args.get('run') + self._validate(run=name) + run = self._get_run(name) + views_list = [view.display_name for view in run.views] + return self.respond_as_json(views_list) + + @wrappers.Request.application + def workers_route(self, request: werkzeug.Request): + name = request.args.get('run') + view = request.args.get('view') + self._validate(run=name, view=view) + run = self._get_run(name) + return self.respond_as_json(run.get_workers(view)) + + @wrappers.Request.application + def spans_route(self, request: werkzeug.Request): + name = request.args.get('run') + worker = request.args.get('worker') + self._validate(run=name, worker=worker) + run = self._get_run(name) + return self.respond_as_json(run.get_spans(worker)) + + @wrappers.Request.application + def overview_route(self, 
request: werkzeug.Request): + profile = self._get_profile_for_request(request) + name = request.args.get('run') + run = self._get_run(name) + data = profile.overview + is_gpu_used = profile.has_runtime or profile.has_kernel or profile.has_memcpy_or_memset + normal_workers = [worker for worker in run.workers if worker != 'All'] + data['environments'] = [{'title': 'Number of Worker(s)', 'value': str(len(normal_workers))}, + {'title': 'Device Type', 'value': 'GPU' if is_gpu_used else 'CPU'}] + if profile.gpu_summary and profile.gpu_tooltip: + data['gpu_metrics'] = {'title': 'GPU Summary', + 'data': profile.gpu_summary, + 'tooltip': profile.gpu_tooltip} + + return self.respond_as_json(data) + + @wrappers.Request.application + def operation_pie_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + + group_by = request.args.get('group_by') + if group_by == 'OperationAndInputShape': + return self.respond_as_json(profile.operation_pie_by_name_input) + else: + return self.respond_as_json(profile.operation_pie_by_name) + + @wrappers.Request.application + def operation_table_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + + group_by = request.args.get('group_by') + if group_by == 'OperationAndInputShape': + return self.respond_as_json(profile.operation_table_by_name_input) + else: + return self.respond_as_json(profile.operation_table_by_name) + + @wrappers.Request.application + def operation_stack_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + + op_name = request.args.get('op_name') + self._validate(op_name=op_name) + group_by = request.args.get('group_by') + input_shape = request.args.get('input_shape') + if group_by == 'OperationAndInputShape': + return self.respond_as_json(profile.operation_stack_by_name_input[str(op_name)+'###'+str(input_shape)]) + else: + return self.respond_as_json(profile.operation_stack_by_name[str(op_name)]) + + @wrappers.Request.application + def kernel_pie_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + + return self.respond_as_json(profile.kernel_pie) + + @wrappers.Request.application + def kernel_table_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + + group_by = request.args.get('group_by') + if group_by == 'Kernel': + return self.respond_as_json(profile.kernel_table) + else: + return self.respond_as_json(profile.kernel_op_table) + + @wrappers.Request.application + def kernel_tc_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + + return self.respond_as_json(profile.tc_pie) + + @wrappers.Request.application + def trace_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + + if not profile.has_kernel: # Pure CPU. 
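+            # Pure CPU runs have no GPU metrics to splice into the trace, so the cached
+            # file can be served as-is; it only needs gzip compression here when the
+            # source is not already a .gz, since the response below is always sent with
+            # 'Content-Encoding: gzip'.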
+ raw_data = self._cache.read(profile.trace_file_path) + if not profile.trace_file_path.endswith('.gz'): + raw_data = gzip.compress(raw_data, 1) + else: + file_with_gpu_metrics = self._gpu_metrics_file_dict.get(profile.trace_file_path) + if file_with_gpu_metrics: + raw_data = io.read(file_with_gpu_metrics) + else: + raw_data = self._cache.read(profile.trace_file_path) + if profile.trace_file_path.endswith('.gz'): + raw_data = gzip.decompress(raw_data) + raw_data = profile.append_gpu_metrics(raw_data) + + # write the data to temp file + fp = tempfile.NamedTemporaryFile('w+b', suffix='.json.gz', dir=self._temp_dir, delete=False) + fp.close() + # Already compressed, no need to gzip.open + with open(fp.name, mode='wb') as file: + file.write(raw_data) + self._gpu_metrics_file_dict[profile.trace_file_path] = fp.name + + headers = [('Content-Encoding', 'gzip')] + headers.extend(TorchProfilerPlugin.headers) + return werkzeug.Response(raw_data, content_type=TorchProfilerPlugin.CONTENT_TYPE, headers=headers) + + @wrappers.Request.application + def dist_gpu_info_route(self, request: werkzeug.Request): + profile = self._get_distributed_profile_for_request(request) + return self.respond_as_json(profile.gpu_info) + + @wrappers.Request.application + def comm_overlap_route(self, request: werkzeug.Request): + profile = self._get_distributed_profile_for_request(request) + return self.respond_as_json(profile.steps_to_overlap) + + @wrappers.Request.application + def comm_wait_route(self, request: werkzeug.Request): + profile = self._get_distributed_profile_for_request(request) + return self.respond_as_json(profile.steps_to_wait) + + @wrappers.Request.application + def comm_ops_route(self, request: werkzeug.Request): + profile = self._get_distributed_profile_for_request(request) + return self.respond_as_json(profile.comm_ops) + + @wrappers.Request.application + def memory_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + start_ts = request.args.get('start_ts', None) + end_ts = request.args.get('end_ts', None) + memory_metric = request.args.get('memory_metric', 'KB') + if start_ts is not None: + start_ts = int(start_ts) + if end_ts is not None: + end_ts = int(end_ts) + + return self.respond_as_json( + profile.get_memory_stats(start_ts=start_ts, end_ts=end_ts, memory_metric=memory_metric), True) + + @wrappers.Request.application + def memory_curve_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + time_metric = request.args.get('time_metric', 'ms') + memory_metric = request.args.get('memory_metric', 'MB') + return self.respond_as_json( + profile.get_memory_curve(time_metric=time_metric, memory_metric=memory_metric), True) + + @wrappers.Request.application + def memory_events_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + start_ts = request.args.get('start_ts', None) + end_ts = request.args.get('end_ts', None) + time_metric = request.args.get('time_metric', 'ms') + memory_metric = request.args.get('memory_metric', 'KB') + if start_ts is not None: + start_ts = int(start_ts) + if end_ts is not None: + end_ts = int(end_ts) + + return self.respond_as_json( + profile.get_memory_events(start_ts, end_ts, time_metric=time_metric, + memory_metric=memory_metric), True) + + @wrappers.Request.application + def module_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + content = profile.get_module_view() + if content: + return self.respond_as_json(content, 
True) + else: + name = request.args.get('run') + worker = request.args.get('worker') + span = request.args.get('span') + raise exceptions.NotFound('could not find the run for %s/%s/%s' % (name, worker, span)) + + @wrappers.Request.application + def op_tree_route(self, request: werkzeug.Request): + profile = self._get_profile_for_request(request) + content = profile.get_operator_tree() + return self.respond_as_json(content, True) + + @wrappers.Request.application + def diff_run_route(self, request: werkzeug.Request): + base, exp = self.get_diff_runs(request) + diff_stats = self.get_diff_status(base, exp) + content = diff_stats.get_diff_tree_summary() + return self.respond_as_json(content, True) + + @wrappers.Request.application + def diff_run_node_route(self, request: werkzeug.Request): + base, exp = self.get_diff_runs(request) + path = request.args.get('path', '0') + stats_dict = self.get_diff_stats_dict(base, exp) + diff_stat = stats_dict.get(path) + if diff_stat is None: + raise exceptions.NotFound('could not find diff run for %s' % (path)) + content = diff_stat.get_diff_node_summary(path) + return self.respond_as_json(content, True) + + @wrappers.Request.application + def static_file_route(self, request: werkzeug.Request): + filename = os.path.basename(request.path) + extension = os.path.splitext(filename)[1] + if extension == '.html': + mimetype = 'text/html' + elif extension == '.css': + mimetype = 'text/css' + elif extension == '.js': + mimetype = 'application/javascript' + else: + mimetype = 'application/octet-stream' + filepath = os.path.join(os.path.dirname(__file__), 'static', filename) + try: + with open(filepath, 'rb') as infile: + contents = infile.read() + except IOError: + raise exceptions.NotFound('404 Not Found') + return werkzeug.Response( + contents, content_type=mimetype, headers=TorchProfilerPlugin.headers + ) + + @staticmethod + def respond_as_json(obj, compress: bool = False): + content = json.dumps(obj) + headers = [] + headers.extend(TorchProfilerPlugin.headers) + if compress: + content_bytes = content.encode('utf-8') + raw_data = gzip.compress(content_bytes, 1) + headers.append(('Content-Encoding', 'gzip')) + return werkzeug.Response(raw_data, content_type=TorchProfilerPlugin.CONTENT_TYPE, headers=headers) + else: + return werkzeug.Response(content, content_type=TorchProfilerPlugin.CONTENT_TYPE, headers=headers) + + @property + def is_loading(self): + with self._load_lock: + return bool(self._load_threads) + + def get_diff_runs(self, request: werkzeug.Request): + name = request.args.get('run') + span = request.args.get('span') + worker = request.args.get('worker') + self._validate(run=name, worker=worker, span=span) + base = self._get_profile(name, worker, span) + + exp_name = request.args.get('exp_run') + exp_span = request.args.get('exp_span') + exp_worker = request.args.get('exp_worker') + self._validate(exp_run=exp_name, exp_worker=exp_worker, exp_span=exp_span) + exp = self._get_profile(exp_name, exp_worker, exp_span) + + return base, exp + + def get_diff_status(self, base: RunProfile, exp: RunProfile): + key = (base, exp) + diff_stats = self.diff_run_cache.get(key) + if diff_stats is None: + diff_stats = base.compare_run(exp) + self.diff_run_cache[key] = diff_stats + + return diff_stats + + def get_diff_stats_dict(self, base: RunProfile, exp: RunProfile): + key = (base, exp) + stats_dict = self.diff_run_flatten_cache.get(key) + if stats_dict is None: + diff_stats = self.get_diff_status(base, exp) + stats_dict = diff_stats.flatten_diff_tree() + 
self.diff_run_flatten_cache[key] = stats_dict + return stats_dict + + def _monitor_runs(self): + logger.info('Monitor runs begin') + + try: + touched = set() + while True: + try: + logger.debug('Scan run dir') + run_dirs = self._get_run_dirs() + + has_dir = False + # Assume no deletion on run directories, trigger async load if find a new run + for run_dir in run_dirs: + has_dir = True + if run_dir not in touched: + touched.add(run_dir) + logger.info('Find run directory %s', run_dir) + # Use threading to avoid UI stall and reduce data parsing time + t = threading.Thread(target=self._load_run, args=(run_dir,)) + t.start() + with self._load_lock: + self._load_threads.append(t) + + if not has_dir: + # handle directory removed case. + self._runs.clear() + except Exception as ex: + logger.warning('Failed to scan runs. Exception=%s', ex, exc_info=True) + + time.sleep(consts.MONITOR_RUN_REFRESH_INTERNAL_IN_SECONDS) + except Exception: + logger.exception('Failed to start monitor_runs') + + def _receive_runs(self): + while True: + run: Run = self._queue.get() + if run is None: + continue + + logger.info('Add run %s', run.name) + with self._runs_lock: + is_new = run.name not in self._runs + self._runs[run.name] = run + if is_new: + self._runs = OrderedDict(sorted(self._runs.items())) + + def _get_run_dirs(self): + """Scan logdir, find PyTorch Profiler run directories. + A directory is considered to be a run if it contains 1 or more *.pt.trace.json[.gz]. + E.g. there are 2 runs: run1, run2 + /run1 + /[worker1].pt.trace.json.gz + /[worker2].pt.trace.json.gz + /run2 + /[worker1].pt.trace.json + """ + for root, _, files in io.walk(self.logdir): + for file in files: + if utils.is_chrome_trace_file(file): + yield root + break + + def _load_run(self, run_dir): + try: + name = self._get_run_name(run_dir) + logger.info('Load run %s', name) + # Currently, assume run data is immutable, so just load once + loader = RunLoader(name, run_dir, self._cache) + run = loader.load() + logger.info('Run %s loaded', name) + self._queue.put(run) + except Exception as ex: + logger.warning('Failed to load run %s. 
Exception=%s', run_dir, ex, exc_info=True)

+        t = threading.current_thread()
+        with self._load_lock:
+            try:
+                self._load_threads.remove(t)
+            except ValueError:
+                logger.warning('could not find the loading thread for run directory {}'.format(run_dir))
+
+    def _get_run(self, name) -> Run:
+        with self._runs_lock:
+            run = self._runs.get(name, None)
+
+        if run is None:
+            raise exceptions.NotFound('could not find the run for %s' % (name))
+
+        return run
+
+    def _get_run_name(self, run_dir):
+        logdir = io.abspath(self.logdir)
+        if run_dir == logdir:
+            name = io.basename(run_dir)
+        else:
+            name = io.relpath(run_dir, logdir)
+        return name
+
+    def _get_profile_for_request(self, request: werkzeug.Request) -> RunProfile:
+        name = request.args.get('run')
+        span = request.args.get('span')
+        worker = request.args.get('worker')
+        self._validate(run=name, worker=worker)
+        profile = self._get_profile(name, worker, span)
+        if not isinstance(profile, RunProfile):
+            raise exceptions.BadRequest('Got an unexpected profile type %s for %s/%s' % (type(profile), name, worker))
+
+        return profile
+
+    def _get_distributed_profile_for_request(self, request: werkzeug.Request) -> DistributedRunProfile:
+        name = request.args.get('run')
+        span = request.args.get('span')
+        self._validate(run=name)
+        profile = self._get_profile(name, 'All', span)
+        if not isinstance(profile, DistributedRunProfile):
+            raise exceptions.BadRequest('Got an unexpected distributed profile type %s for %s' % (type(profile), name))
+
+        return profile
+
+    def _get_profile(self, name, worker, span):
+        run = self._get_run(name)
+        profile = run.get_profile(worker, span)
+        if profile is None:
+            raise exceptions.NotFound('could not find the profile for %s/%s/%s' % (name, worker, span))
+        return profile
+
+    def _validate(self, **kwargs):
+        for name, v in kwargs.items():
+            if v is None:
+                raise exceptions.BadRequest('Must specify %s in request url' % (name))
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/__init__.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ca062abf58245753361a96890a2ee1ccdec42fb
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/__init__.py
@@ -0,0 +1,7 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# --------------------------------------------------------------------------
+
+from .loader import RunLoader
+
+__all__ = ['RunLoader']
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/communication.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/communication.py
new file mode 100644
index 0000000000000000000000000000000000000000..0894ea3966f8814d4e18884920ee1187ef81e80b
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/communication.py
@@ -0,0 +1,91 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# -------------------------------------------------------------------------
+from typing import Dict, List, Tuple
+
+from ..
import utils +from .node import CommunicationNode +from .range_utils import get_ranges_sum, merge_ranges + +logger = utils.get_logger() + + +def generate_communication_nodes( + communication_data: Dict[int, CommunicationNode], + steps: List[Tuple[int, int]], + steps_names: List[str]): + comm_node_list: List[CommunicationNode] = [] + + # Sort the communication node according the start time, this is for correlating communication node between workers + for comm_node in communication_data.values(): + comm_node.kernel_ranges.sort(key=lambda x: (x[0], -x[1])) + comm_node_list.append(comm_node) + comm_node_list.sort(key=lambda x: (x.start_time, -x.end_time)) + + # Find each communication node belong to which step + index = 0 + valid_steps = len(steps) + for comm_node in comm_node_list: + while index < valid_steps: + if comm_node.start_time >= steps[index][0] and comm_node.end_time <= steps[index][1]: + comm_node.step_name = steps_names[index] + break + elif comm_node.start_time >= steps[index][1]: + index += 1 + else: + logger.error('Found a communication op not belong to any step.') + break + if index >= valid_steps: + logger.error('Found communication ops not belong to any step. ') + break + + return comm_node_list + + +def analyze_communication_nodes(comm_node_list: List[CommunicationNode])\ + -> Tuple[Dict[str, Tuple[int, int]], Dict[str, List[int]]]: + step_comm_stats: Dict[str, Tuple[int, int]] = {} + total_comm_stats: Dict[str, Tuple[int, int, List, List]] = {} + + step_to_comm_ranges: Dict[str, Tuple[List, List]] = {} + for comm_node in comm_node_list: + if comm_node.step_name not in step_to_comm_ranges: + step_to_comm_ranges[comm_node.step_name] = [[], []] + step_to_comm_ranges[comm_node.step_name][0].extend(comm_node.kernel_ranges) + step_to_comm_ranges[comm_node.step_name][1].extend(comm_node.real_time_ranges) + + if comm_node.name not in total_comm_stats: + total_comm_stats[comm_node.name] = [0, 0, [], []] + total_comm_stats[comm_node.name][0] += 1 + bytes_one_value = 0 + if comm_node.input_shape: + for i in range(len(comm_node.input_shape)): + if comm_node.input_type[i] == 'long int': + bytes_one_value = 8 + elif comm_node.input_type[i] == 'float': + bytes_one_value = 4 + elif comm_node.input_type[i] == 'int': + bytes_one_value = 4 + elif comm_node.input_type[i] == 'c10::Half': + bytes_one_value = 2 + else: + logger.warning('Found an unknown tensor type: {}'.format(comm_node.input_type[i])) + bytes_one_value = 0 + total_size = 1 + for size in comm_node.input_shape[i]: + total_size *= size + total_comm_stats[comm_node.name][1] += total_size * bytes_one_value + total_comm_stats[comm_node.name][2].extend(comm_node.kernel_ranges) + total_comm_stats[comm_node.name][3].extend(comm_node.real_time_ranges) + + for step, comm_ranges in step_to_comm_ranges.items(): + step_comm_stats[step] = [ + get_ranges_sum(merge_ranges(comm_ranges[0])), + get_ranges_sum(merge_ranges(comm_ranges[1])) + ] + + for _, stats in total_comm_stats.items(): + stats[2] = get_ranges_sum(merge_ranges(stats[2])) + stats[3] = get_ranges_sum(merge_ranges(stats[3])) + + return step_comm_stats, total_comm_stats diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py new file mode 100644 index 0000000000000000000000000000000000000000..3f9e296d5b7f02272cf323fd49f322b8a0d9763a --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py @@ -0,0 +1,356 @@ +# 
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# --------------------------------------------------------------------------
+import gzip
+import io as sysio
+import json
+import re
+import tempfile
+from json.decoder import JSONDecodeError
+from typing import Dict, List, Optional
+
+from .. import io, utils
+from ..utils import href
+from . import trace
+from .communication import analyze_communication_nodes
+from .event_parser import CommLibTypes, EventParser, ProfileRole
+from .gpu_metrics_parser import GPUMetricsParser
+from .kernel_parser import KernelParser
+from .memory_parser import MemoryParser, MemorySnapshot
+from .node import OperatorNode
+from .op_agg import ModuleAggregator
+from .overall_parser import OverallParser
+from .tensor_cores_parser import TensorCoresParser
+from .trace import BaseEvent, EventTypes, MemoryEvent
+
+logger = utils.get_logger()
+
+
+class RunProfileData(object):
+    def __init__(self, worker: str, span: str, trace_json: Dict):
+        self.worker = worker
+        self.span = span
+
+        # metadata
+        self.is_pytorch_lightning = trace_json.get('Framework', None) == 'pytorch-lightning'
+        self.data_schema_version = trace_json.get('schemaVersion', None)
+        self.distributed_info = trace_json.get('distributedInfo', None)
+        self.device_props = trace_json.get('deviceProperties', None)
+
+        self.profiler_start_ts = float('inf')
+        self.events: List[BaseEvent] = []
+
+        trace_body = trace_json['traceEvents']
+        fwd_bwd_events = []
+        for data in trace_body:
+            if data.get('cat') == 'forward_backward':
+                fwd_bwd_events.append(data)
+            else:
+                event = trace.create_event(data, self.is_pytorch_lightning)
+                if event is not None:
+                    self.profiler_start_ts = min(self.profiler_start_ts, event.ts)
+                    self.events.append(event)
+
+        self.events.sort(key=lambda e: e.ts)
+        self.forward_backward_events = trace.create_association_events(fwd_bwd_events)
+
+        self.trace_file_path: str = None
+
+        # Event Parser results
+        self.tid2tree: Dict[int, OperatorNode] = None
+        self.pl_tid2tree: Dict[int, OperatorNode] = None
+        self.used_devices = []
+        self.use_dp: bool = False
+        self.use_ddp: bool = False
+        self.comm_lib = None
+        self.has_runtime: bool = False
+        self.has_kernel: bool = False
+        self.has_communication: bool = False
+        self.has_memcpy_or_memset: bool = False
+        self.role_ranges = None
+        self.steps_costs = None
+        self.steps_names = None
+        self.avg_costs = None
+
+        # GPU parser
+        self.gpu_metrics_parser: GPUMetricsParser = None
+
+        # Operator aggregator
+        self.op_list_groupby_name = None
+        self.op_list_groupby_name_input = None
+        self.stack_lists_group_by_name = None
+        self.stack_lists_group_by_name_input = None
+        self.kernel_list_groupby_name_op = None
+
+        # Kernel and Tensor Core
+        self.kernel_stat = None
+        self.tc_ratio = None
+        self.tc_eligible_ops_kernel_ratio = None
+        self.tc_used_ratio = None  # If it's a pure CPU run, then this stays None.
+
+        # Communication
+        self.comm_node_list = None
+        self.comm_overlap_costs = None
+        self.memory_snapshot: Optional[MemorySnapshot] = None
+
+        # recommendations based on analysis results
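+        # e.g. (illustrative) a populated list could contain strings such as:
+        #   "This run has high time cost on input data loading. 12.3% of the step time is in DataLoader. ..."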
+        self.recommendations = []
+
+    @staticmethod
+    def parse(worker, span, path, cache_dir):
+        trace_path, trace_json = RunProfileData._preprocess_file(path, cache_dir)
+
+        profile = RunProfileData.from_json(worker, span, trace_json)
+        profile.trace_file_path = trace_path
+        return profile
+
+    @staticmethod
+    def from_json(worker, span, trace_json: Dict):
+        profile = RunProfileData(worker, span, trace_json)
+        with utils.timing('Data processing'):
+            profile.process()
+        profile.analyze()
+        return profile
+
+    @staticmethod
+    def _preprocess_file(trace_path, cache_dir):
+        if not io.exists(trace_path):
+            raise FileNotFoundError(trace_path)
+
+        data = io.read(trace_path)
+        if trace_path.endswith('.gz'):
+            data = gzip.decompress(data)
+
+        json_reencode = False
+        try:
+            trace_json = json.loads(data)
+        except JSONDecodeError as e:
+            # Kineto may export a json file with non-ascii characters. Until this is fixed, use a workaround
+            # to handle the JSONDecodeError: re-encode the data and save it to a temp file.
+            try:
+                trace_json = json.loads(data, strict=False)
+            except JSONDecodeError:
+                with sysio.StringIO() as fout:
+                    str_data = data.decode('utf-8')
+                    # only replace the N/A without surrounding double quotes
+                    fout.write(re.sub(r'(?<!")(N/A)(?!")', '"N/A"', str_data))
+                    trace_json = json.loads(fout.getvalue())
+                    logger.warning('Got JSONDecodeError: %s, re-encoding it to a temp file' % e.msg)
+                    json_reencode = True
+
+        # workaround to remove the 'Record Window End' events, which carry a huge end timestamp
+        event_list = trace_json['traceEvents']
+        end_index = None
+        start_index = None
+        for i in reversed(range(len(event_list))):
+            if event_list[i]['name'] == 'Record Window End':
+                end_index = i
+            elif event_list[i]['name'].startswith('Iteration Start:'):
+                start_index = i
+            if start_index is not None and end_index is not None:
+                break
+
+        if start_index is not None and end_index is not None:
+            dur = event_list[end_index]['ts'] - event_list[start_index]['ts']
+            if dur > 24 * 3600 * 1000:
+                del trace_json['traceEvents'][end_index]
+                json_reencode = True
+
+        if json_reencode:
+            fp = tempfile.NamedTemporaryFile('w+t', suffix='.json.gz', dir=cache_dir, delete=False)
+            fp.close()
+            with gzip.open(fp.name, mode='wt') as fzip:
+                fzip.write(json.dumps(trace_json))
+            trace_path = fp.name
+
+        return trace_path, trace_json
+
+    def process(self):
+        with utils.timing('EventParser.parse'):
+            parser = EventParser()
+            self.tid2tree, self.pl_tid2tree = parser.parse(self.events, self.forward_backward_events)
+
+        self.has_runtime = parser.has_runtime
+        self.has_kernel = parser.has_kernel
+        self.has_memcpy_or_memset = parser.has_memcpy_or_memset
+        self.steps_names = parser.steps_names
+        self.used_devices = sorted(list(parser.used_devices))
+        self.use_dp = parser.use_dp
+        self.use_ddp = parser.use_ddp
+        self.role_ranges = parser.role_ranges
+
+        self.comm_lib = parser.comm_lib
+        self.has_communication = parser.has_communication
+        self.comm_node_list = parser.comm_node_list
+
+        # Start aggregating
+        logger.debug('ModuleAggregator')
+        with utils.timing('ModuleAggregator aggregation'):
+            module_aggregator = ModuleAggregator()
+            module_aggregator.aggregate(self.tid2tree)
+        self.op_list_groupby_name = module_aggregator.op_list_groupby_name
+        self.op_list_groupby_name_input = module_aggregator.op_list_groupby_name_input
+        self.stack_lists_group_by_name = module_aggregator.stack_lists_group_by_name
+        self.stack_lists_group_by_name_input = module_aggregator.stack_lists_group_by_name_input
+        self.kernel_list_groupby_name_op = module_aggregator.kernel_list_groupby_name_op
+
+        logger.debug('OverallParser')
+        with utils.timing('OverallParser aggregation'):
+            overall_parser = OverallParser()
+            overall_parser.aggregate(parser.steps, parser.role_ranges)
+        self.avg_costs = overall_parser.avg_costs
+        self.steps_costs = overall_parser.steps_costs
+        self.comm_overlap_costs = overall_parser.communication_overlap
+
+        logger.debug('GPUMetricsParser')
+        self.gpu_metrics_parser = GPUMetricsParser.parse_events(
+            self.events, parser.global_start_ts, parser.global_end_ts, parser.steps[0][0], parser.steps[-1][1])
+
+        logger.debug('TensorCoresParser')
+        tensorcores_parser = TensorCoresParser.parse_events(
+            self.tid2tree, module_aggregator.ops, self.gpu_metrics_parser.gpu_ids)
+        self.tc_eligible_ops_kernel_ratio = tensorcores_parser.tc_eligible_ops_kernel_ratio
+        self.tc_ratio = tensorcores_parser.tc_ratio
+
+        if self.has_kernel:
+            logger.debug('KernelParser')
+            with utils.timing('parse kernels'):
+                kernel_parser = KernelParser()
+                kernel_parser.parse_events(self.events)
+            self.kernel_stat = kernel_parser.kernel_stat
+            self.tc_used_ratio = kernel_parser.tc_used_ratio
+
+        memory_events = self._memory_events()
+        if memory_events:
+            memory_parser = MemoryParser(memory_events)
+            self.memory_snapshot = memory_parser.find_memory_nodes(self.tid2tree)
+
+    def analyze(self):
+        self.recommendations = []
+
+        dataloader_ratio = self.avg_costs.costs[ProfileRole.DataLoader] / self.avg_costs.costs[ProfileRole.Total]
+        if dataloader_ratio > 0.05:
+            percentage = dataloader_ratio * 100
+            url = 'https://pytorch.org/docs/stable/data.html#single-and-multi-process-data-loading'
+            self.recommendations.append(
+                f'This run has high time cost on input data loading. {percentage:.1f}% of the step '
+                + "time is in DataLoader. You could try to set num_workers on DataLoader's construction "
+                + f"and {href('enable multi-processes on data loading', url)}."
+            )
+
+        self._analyze_distributed_metrics()
+        self._analyze_gpu_metrics()
+
+        if self.device_props:
+            # The Tensor Cores feature is available on GPU cards with compute capability >= 7.0
+            # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
+            major = self.device_props[0].get('computeMajor')
+            # If it's a pure CPU run, then self.tc_used_ratio is None, so this rule will not be triggered.
+            if (major is not None and major >= 7 and
+                    self.tc_used_ratio == 0.0 and
+                    self.tc_eligible_ops_kernel_ratio > 0.0):
+                url = 'https://pytorch.org/docs/stable/amp.html'
+                self.recommendations.append(
+                    f'Kernels with {round(self.tc_eligible_ops_kernel_ratio * 100)}%'
+                    ' time are launched by Tensor Cores eligible operators. '
+                    f"You could enable {href('Automatic Mixed Precision', url)} to speed up by using FP16.")
+
+        # Memory related
+        if self.memory_snapshot:
+            for (dev_type, dev_id), peak_mem in self.memory_snapshot.get_peak_memory().items():
+                if dev_type == -1:  # ignore CPU
+                    continue
+                total_mem = self.device_props[dev_id].get('totalGlobalMem')
+                if total_mem is not None and peak_mem > total_mem * 0.9:
+                    percentage = peak_mem / total_mem * 100
+                    total_mem_gb = total_mem / 1024 / 1024 / 1024
+                    ckp_url = 'https://pytorch.org/docs/stable/checkpoint.html'
+                    amp_url = 'https://pytorch.org/docs/stable/amp.html'
+                    self.recommendations.append(
+                        f'Device memory usage is at the limit of device memory capacity '
+                        f'({percentage:.1f}% of {total_mem_gb:.1f}GB on GPU{dev_id}). '
+                        'To make better use of your GPU or to use a larger batch size for training, please refer to '
+                        f"{href('Gradient Checkpoint', ckp_url)} or {href('Automatic Mixed Precision', amp_url)}.")
+                    break
+
+    def _analyze_distributed_metrics(self):
+        if self.use_dp and len(self.used_devices) > 1:
+            url = 'https://pytorch.org/docs/stable/notes/cuda.html#cuda-nn-ddp-instead'
+            self.recommendations.append(
+                f"It is recommended to {href('use DistributedDataParallel instead of DataParallel', url)}"
+                ' to do multi-GPU training.')
+
+        if self.use_ddp and CommLibTypes.Nccl not in self.comm_lib and self.device_props:
+            for device_prop in self.device_props:
+                major = device_prop.get('computeMajor')
+                minor = device_prop.get('computeMinor')
+                if major is None or minor is None:
+                    continue
+                compute_capability = '{}.{}'.format(major, minor)
+                if float(compute_capability) >= 3.5:
+                    text = (
+                        'The Nccl backend is currently the fastest and most highly recommended backend'
+                        ' when using DDP for training.')
+                    self.recommendations.append(text)
+                    break
+
+        communication_ratio = self.avg_costs.costs[ProfileRole.Communication] / self.avg_costs.costs[ProfileRole.Total]
+        if communication_ratio > 0.1:
+            percentage = communication_ratio * 100
+            compress_url = 'https://pytorch.org/docs/stable/ddp_comm_hooks.html'
+            grad_acc_url = 'https://towardsdatascience.com/what-is-gradient-accumulation-in-deep-learning-ec034122cfa'
+            lamb_url = 'https://nvidia.github.io/apex/optimizers.html#apex.optimizers.FusedLAMB'
+            self.recommendations.append(
+                f'This run has high time cost on communication. {percentage:.1f}% of the step time is in '
+                f"communication. You could try {href('Gradient Compression', compress_url)} or "
+                f"{href('Gradient Accumulation', grad_acc_url)} or increase the batch size. "
+                'Note: Gradient accumulation will increase the global effective batch size, which may hurt model '
+                f"convergence and accuracy. For such cases, you may want to evaluate {href('LAMB optimizer', lamb_url)}."
+            )
+
+    def _memory_events(self) -> List[MemoryEvent]:
+        memory_events = [e for e in self.events if e.type == EventTypes.MEMORY]
+        memory_events.sort(key=lambda e: e.ts)
+        return memory_events
+
+    def _analyze_gpu_metrics(self):
+        def get_gpus_str(gpus):
+            gpu_list_str = str(gpus[0])
+            for i in range(1, len(gpus)):
+                if i == len(gpus) - 1:
+                    gpu_list_str += ' and {}'.format(gpus[i])
+                else:
+                    gpu_list_str += ', {}'.format(gpus[i])
+            has_str = 'has' if len(gpus) == 1 else 'have'
+            return gpu_list_str, has_str
+
+        low_util_gpus = []
+        for gpu_id in self.gpu_metrics_parser.gpu_ids:
+            if self.gpu_metrics_parser.gpu_utilization[gpu_id] < 0.5:
+                low_util_gpus.append(gpu_id)
+        if len(low_util_gpus) > 0:
+            gpu_list_str, has_str = get_gpus_str(low_util_gpus)
+            text = 'GPU {} {} low utilization. You could try to ' \
+                'increase batch size to improve. Note: Increasing batch size ' \
+                'may affect the speed and stability of model convergence.'.format(gpu_list_str, has_str)
+            self.recommendations.append(text)
+
+
+class DistributedRunProfileData:
+    def __init__(self, run_profile_data: RunProfileData):
+        self.worker = run_profile_data.worker
+        self.span = run_profile_data.span
+        self.steps_names = run_profile_data.steps_names
+        self.has_communication = run_profile_data.has_communication
+        self.comm_lib = run_profile_data.comm_lib
+        self.comm_node_list = run_profile_data.comm_node_list
+        self.comm_overlap_costs = run_profile_data.comm_overlap_costs
+        self.used_devices = run_profile_data.used_devices
+        self.device_props = run_profile_data.device_props
+        self.distributed_info = run_profile_data.distributed_info
+
+        self.total_comm_stats = None
+        self.step_comm_stats = None
+
+    def communication_parse(self):
+        self.step_comm_stats, self.total_comm_stats = analyze_communication_nodes(self.comm_node_list)
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/__init__.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dae52e351eaa75fc65788904de951cc68aba4ab
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/__init__.py
@@ -0,0 +1,3 @@
+from .contract import DiffStats, OpAgg
+from .tree import (DiffNode, compare_op_tree, diff_summary, print_node,
+                   print_ops)
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/contract.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/contract.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce0cba35de7b752ea6dab966fd7febec77bf6f02
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/contract.py
@@ -0,0 +1,99 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# -------------------------------------------------------------------------
+from collections import namedtuple
+from typing import Dict, List
+
+OpAgg = namedtuple('OpAgg', [
+    'name',
+    'calls',
+    'host_duration',
+    'device_duration',
+    'self_host_duration',
+    'self_device_duration'])
+
+
+class OpStats:
+    def __init__(self,
+                 name,
+                 duration,
+                 device_duration,
+                 total_duration,
+                 aggs: List[OpAgg]):
+        self.name = name
+        self.duration = duration
+        self.device_duration = device_duration
+        self.total_duration = total_duration
+        self.op_aggs = aggs
+
+    def __str__(self) -> str:
+        return f'{self.name}: {self.duration}/{self.device_duration}/{len(self.op_aggs)}'
+
+
+class DiffStats:
+    def __init__(self, left: OpStats, right: OpStats):
+        self.left = left
+        self.right = right
+        self.children: List[DiffStats] = []
+
+    def flatten_diff_tree(self) -> Dict[str, 'DiffStats']:
+        result: Dict[str, DiffStats] = {}
+
+        def traverse(node: DiffStats, path: str):
+            result[path] = node
+            for i, child in enumerate(node.children):
+                traverse(child, f'{path}-{i}')
+
+        traverse(self, '0')
+        return result
+
+    def to_dict(self):
+        d = {
+            'left': {
+                'name': self.left.name,
+                'duration': self.left.duration,
+                'device_duration': self.left.device_duration,
+                'total_duration': self.left.total_duration,
+                'aggs': []
+            },
+            'right': {
+                'name': self.right.name,
+                'duration': self.right.duration,
+                'device_duration': self.right.device_duration,
+                'total_duration': self.right.total_duration,
+                'aggs': []
+            }
+        }
+
+        for agg in self.left.op_aggs:
+            d['left']['aggs'].append(agg._asdict())
+
+        for agg in self.right.op_aggs:
+            d['right']['aggs'].append(agg._asdict())
+
+        return d
+
+    def get_diff_tree_summary(self):
+        def traverse_node_recursive(node: DiffStats):
+            d = node.to_dict()
+
+            d['children'] = []
+            for c in node.children:
+                d['children'].append(traverse_node_recursive(c))
+
+            return d
+
+        return traverse_node_recursive(self)
+
+    def get_diff_node_summary(self, path: str):
+        def traverse_node(node: DiffStats, path: str):
+            d = node.to_dict()
+            d['path'] = path
+            return d
+
+        d = traverse_node(self, path)
+        d['children'] = []
+        for i, c in enumerate(self.children):
+            d['children'].append(traverse_node(c, f'{path}-{i}'))
+
+        return d
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/operator.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca0c1f7d4aafbd78228f3a5e70653d48903a97d4
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/operator.py
@@ -0,0 +1,124 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# -------------------------------------------------------------------------
+from abc import ABCMeta
+from typing import List, Tuple, Union
+
+from ..node import DeviceNode, OperatorNode
+from ..op_agg import aggregate_ops
+from .contract import OpAgg
+
+
+class Operator(metaclass=ABCMeta):
+    def __init__(self, name) -> None:
+        self.name: str = name
+
+    def __str__(self) -> str:
+        return f'{self.name}: {self.duration}'
+
+    @property
+    def duration(self) -> int:
+        return 0
+
+    @property
+    def device_duration(self) -> int:
+        return 0
+
+    @property
+    def total_duration(self):
+        return self.device_duration or self.duration
+
+    def aggregate_ops(self):
+        ops, _ = self.get_operators_and_kernels()
+        agg_result = aggregate_ops(ops, [lambda x: x.name])[0]
+        for agg in agg_result.values():
+            yield OpAgg(
+                agg.name,
+                agg.calls,
+                agg.host_duration,
+                agg.device_duration,
+                agg.self_host_duration,
+                agg.self_device_duration)
+
+    def get_operators_and_kernels(self) -> Tuple[List[OperatorNode], List[DeviceNode]]:
+        return [], []
+
+
+class BlankOp(Operator):
+    def __init__(self) -> None:
+        super().__init__('Blank')
+
+
+class UnknownOp(Operator):
+    def __init__(self, device_duration: int, duration: int) -> None:
+        super().__init__('Unknown')
+        # Store the values under private names; assigning directly to the
+        # read-only properties below would raise an AttributeError.
+        self._device_duration = device_duration
+        self._duration = duration
+
+    @property
+    def duration(self) -> int:
+        return self._duration
+
+    @property
+    def device_duration(self) -> int:
+        return self._device_duration
+
+
+class Operators(Operator):
+    def __init__(self, nodes: Union[OperatorNode, List[OperatorNode]]):
+        if not nodes:
+            raise ValueError('the operator node is None or empty')
+        if isinstance(nodes, OperatorNode):
+            super().__init__(nodes.name)
+        elif isinstance(nodes, list):
+            super().__init__('CompositeNodes')
+
+        self.op_nodes: Union[OperatorNode, List[OperatorNode]] = nodes
+
+    @property
+    def duration(self):
+        if isinstance(self.op_nodes, list):
+            return sum(n.duration for n in self.op_nodes)
+        else:
+            return self.op_nodes.duration
+
+    @property
+    def device_duration(self):
+        if isinstance(self.op_nodes, list):
+            return sum(n.device_duration for n in self.op_nodes)
+        else:
+            return self.op_nodes.device_duration
+
+    @property
+    def total_duration(self):
+        if isinstance(self.op_nodes, list):
+            return sum(n.device_duration or n.duration for n in self.op_nodes)
+        else:
+            return self.op_nodes.device_duration or self.op_nodes.duration
+
+    def __str__(self) -> str:
+        if isinstance(self.op_nodes, list):
+            return f'{self.name}: {len(self.op_nodes)}: {self.op_nodes[0].name}: {self.total_duration}'
+        else:
+            return f'{self.name}: {self.op_nodes.__class__.__name__}: {self.total_duration}'
+
+    def get_operators_and_kernels(self) -> Tuple[List[OperatorNode], List[DeviceNode]]:
+        if isinstance(self.op_nodes, list):
+            nodes = self.op_nodes
+        else:
+            nodes = [self.op_nodes]
+
+        ops: List[OperatorNode] = []
+        kernels: List[DeviceNode] = []
+        for n in nodes:
+            o, k = n.get_operator_and_kernels()
+            ops.extend(o)
+            kernels.extend(k)
+        return ops, kernels
+
+
+def create_operator(op_nodes: Union[OperatorNode, List[OperatorNode]]) -> Operator:
+    if op_nodes:
+        return Operators(op_nodes)
+    else:
+        return BlankOp()
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/tree.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..412c677ce0faacf5c22bbb4f884a6b1adb936586
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/diffrun/tree.py
@@ -0,0 +1,165 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# -------------------------------------------------------------------------
+import sys
+from typing import Generator, List, Union
+
+from ..node import (BackwardNode, DataLoaderNode, ModuleNode, OperatorNode,
+                    OptimizerNode, ProfilerStepNode)
+from .contract import DiffStats, OpStats
+from .operator import Operator, Operators, create_operator
+
+INDENT = '    '
+RUN_NODE_TYPES = (BackwardNode, DataLoaderNode, ModuleNode, OptimizerNode, ProfilerStepNode)
+
+
+class DiffNode:
+    def __init__(self, left: Operator, right: Operator):
+        self.left: Operator = left
+        self.right: Operator = right
+        self.children: List[DiffNode] = []
+
+    def build_tree(self):
+        '''build the children from the left and right nodes'''
+        if not isinstance(self.left, Operators) or not isinstance(self.right, Operators):
+            # TODO: do we need to calculate the stats or not?
+            return
+
+        if isinstance(self.left.op_nodes, OperatorNode) and isinstance(self.right.op_nodes, OperatorNode):
+            # simple node match.
+            diff_nodes = list(DiffNode.compare_operator_nodes(
+                self.left.op_nodes.children, self.right.op_nodes.children))
+            if diff_nodes:
+                self.children.extend(diff_nodes)
+        elif isinstance(self.left.op_nodes, list) and isinstance(self.right.op_nodes, list):
+            # compare the two lists
+            diff_nodes = list(DiffNode.compare_operator_nodes(self.left.op_nodes, self.right.op_nodes))
+            if diff_nodes:
+                self.children.extend(diff_nodes)
+        else:
+            # one single item and one list
+            pass
+            # TODO: do we need to compute the stats for both operators and kernels here?
+
+    @staticmethod
+    def create_node(
+            left: Union[OperatorNode, List[OperatorNode]],
+            right: Union[OperatorNode, List[OperatorNode]]) -> 'DiffNode':
+        if isinstance(left, list) and len(left) == 1:
+            left = left[0]
+        if isinstance(right, list) and len(right) == 1:
+            right = right[0]
+
+        node = DiffNode(create_operator(left), create_operator(right))
+        node.build_tree()
+        return node
+
+    @staticmethod
+    def compare_operator_nodes(
+            left_nodes: List[OperatorNode],
+            right_nodes: List[OperatorNode]) -> Generator['DiffNode', None, None]:
+        '''Given two lists of OperatorNodes, find the matching DataLoader/Module/Backward/Optimizer
+        nodes and create the list of child DiffNodes.
+        '''
+        right_keys = [(type(r), r.name) for r in right_nodes]
+
+        # find matching points in the two lists
+        matched_pairs = []
+        key_index = 0
+        for i, left_node in enumerate(left_nodes):
+            if not isinstance(left_node, RUN_NODE_TYPES):
+                # only handle DataLoader/Module/Backward/Optimizer nodes
+                continue
+
+            for j in range(key_index, len(right_keys)):
+                if right_keys[j] == (type(left_node), left_node.name):
+                    matched_pairs.append((i, j))
+                    key_index = j + 1
+                    break
+
+        if not matched_pairs:
+            # there are no matching points.
+            return
+
+        # split the two lists by the matching points
+        l_iter = 0
+        r_iter = 0
+
+        for (l, r) in matched_pairs:
+            left_child = left_nodes[l_iter:l]
+            right_child = right_nodes[r_iter:r]
+            if left_child or right_child:
+                yield DiffNode.create_node(left_child, right_child)
+
+            yield DiffNode.create_node(left_nodes[l], right_nodes[r])
+            l_iter = l + 1
+            r_iter = r + 1
+            # TODO: fill in unknown nodes when the gap between the next node's start_time and the
+            # current end time is bigger than a threshold.
+            # Or should we move this logic into the frontend for visualization?
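+        # Illustrative example (hypothetical nodes): with
+        #   left_nodes  = [DataLoaderNode A, OperatorNode aten::mm, ModuleNode B]
+        #   right_nodes = [DataLoaderNode A, ModuleNode B]
+        # matched_pairs is [(0, 0), (2, 1)]; the two matched pairs are compared one-to-one,
+        # and the extra aten::mm on the left is paired with a BlankOp on the right.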
+
+        # process the remaining nodes
+        left_remaining = left_nodes[l_iter:]
+        right_remaining = right_nodes[r_iter:]
+        if left_remaining or right_remaining:
+            yield DiffNode.create_node(left_remaining, right_remaining)
+
+
+def compare_op_tree(left: OperatorNode, right: OperatorNode) -> DiffNode:
+    '''Create the diff tree from two root nodes.
+    TODO: need to handle the case of different threads;
+    do we need to add runtimes besides the children?
+    '''
+    left_children = list(get_tree_operators(left))
+    right_children = list(get_tree_operators(right))
+    return DiffNode.create_node(left_children, right_children)
+
+
+def get_tree_operators(root: OperatorNode) -> Generator[OperatorNode, None, None]:
+    '''Get the operators under the root operator, excluding any ProfilerStepNode level.
+    '''
+    profiler_nodes = [c for c in root.children if isinstance(c, ProfilerStepNode)]
+    if not profiler_nodes:
+        # there is no ProfilerStepNode at all
+        yield from root.children
+    else:
+        yield from (child for p in profiler_nodes for child in p.children)
+
+
+def diff_summary(node: DiffNode) -> DiffStats:
+    if not node:
+        return None
+
+    left = OpStats(
+        node.left.name,
+        node.left.duration,
+        node.left.device_duration,
+        node.left.total_duration,
+        list(node.left.aggregate_ops()))
+    right = OpStats(
+        node.right.name,
+        node.right.duration,
+        node.right.device_duration,
+        node.right.total_duration,
+        list(node.right.aggregate_ops()))
+
+    stats = DiffStats(left, right)
+    for child in node.children:
+        stats.children.append(diff_summary(child))
+
+    return stats
+
+
+def print_node(node: Union[DiffNode, DiffStats], level: int, index: int, file=sys.stdout):
+    file.write(f'{INDENT * level}level {level}, index {index}:\n')
+    file.write(f'{INDENT * (level + 1)}left : {node.left}\n')
+    file.write(f'{INDENT * (level + 1)}right: {node.right}\n')
+    for i, child in enumerate(node.children):
+        print_node(child, level + 1, i, file=file)
+
+
+def print_ops(op: Operators, prefix: str = INDENT, file=sys.stdout):
+    if isinstance(op.op_nodes, list):
+        for n in op.op_nodes:
+            file.write(f'{prefix}{n.name}\n')
+    else:
+        file.write(f'{prefix}{op.op_nodes.name}\n')
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/event_parser.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/event_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..6df381ae447a702d14898c3cbeffde2001b41b07
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/event_parser.py
@@ -0,0 +1,474 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# -------------------------------------------------------------------------
+import sys
+from collections import defaultdict
+from enum import IntEnum
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from .. import utils
+from .communication import generate_communication_nodes
+from .node import (CommunicationNode, DeviceNode, ModuleNode, OperatorNode, PLModuleNode, PLProfileNode,
+                   ProfilerStepNode, RuntimeNode, create_operator_node)
+from .op_tree import OpTreeBuilder
+from .range_utils import merge_ranges
+from .trace import BaseEvent, DurationEvent, EventTypes, KernelEvent
+
+logger = utils.get_logger()
+
+NcclOpNameSet = ['nccl:broadcast', 'nccl:reduce', 'nccl:all_reduce', 'nccl:all_gather', 'nccl:reduce_scatter']
+GlooOpNameSet = ['gloo:broadcast', 'gloo:reduce', 'gloo:all_reduce', 'gloo:all_gather', 'gloo:reduce_scatter']
+CommLibTypes = IntEnum('CommLibTypes', ['Nccl', 'Gloo'], start=0)
+
+
+class ProfileRole(IntEnum):
+    Kernel = 0
+    Memcpy = 1
+    Memset = 2
+    Communication = 3
+    Runtime = 4
+    DataLoader = 5
+    CpuOp = 6
+    Other = 7
+    Total = 8
+
+
+class NodeParserMixin:
+    def __init__(self, *args, **kwargs):
+        """Please refer to https://stackoverflow.com/questions/9575409/calling-parent-class-init-with-multiple-inheritance-whats-the-right-way  # noqa: E501
+        to see why we need to call super().__init__ this way.
+        """
+        super().__init__(*args, **kwargs)
+
+        self.communication_data: Dict[int, CommunicationNode] = {}
+        self.device_node_list: List[DeviceNode] = []
+        self.runtime_node_list: List[RuntimeNode] = []
+        self.used_devices = set()
+        self.use_dp = False
+        self.use_ddp = False
+        self.comm_lib = set()
+
+    def parse_nodes(self, events: Iterable[BaseEvent]):
+        # For OperatorNode and ProfilerStepNode:
+        # Use the time-interval containment relationship to build the parent-child correlation,
+        # which is consistent with the autograd profiler.
+        # For RuntimeNode:
+        # Use external_id to build the correlation with its parent OperatorNode or ProfilerStepNode.
+        # When a RuntimeNode has duration 0 and starts at the same time as an OperatorNode,
+        # the containment relationship alone cannot tell whether it is a child or a sibling
+        # of that OperatorNode.
+        # value is a list of OperatorNode and ProfilerStepNode. Do not include RuntimeNode
+        tid2list: Dict[int, List[OperatorNode]] = defaultdict(list)
+        # value is a list of PLProfileNode. Do not include RuntimeNode
+        pl_tid2list: Dict[int, List[PLProfileNode]] = defaultdict(list)
+        # value is a list of RuntimeNode with external_id=0. They will be attached to root nodes.
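+        # (e.g. a hypothetical cudaDeviceSynchronize issued outside of any profiled operator
+        # would arrive here with external_id=0 and end up attached to a root node.)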
+        tid2zero_rt_list: Dict[int, List[RuntimeNode]] = defaultdict(list)
+        corrid_to_device: Dict[int, List[DeviceNode]] = defaultdict(list)  # value is a list of DeviceNode
+
+        corrid_to_runtime: Dict[int, RuntimeNode] = {}  # value is a RuntimeNode
+        externalid_to_runtime: Dict[int, List[RuntimeNode]] = defaultdict(list)  # value is a list of RuntimeNode
+
+        for event in events:
+            if event.type == EventTypes.MEMORY:
+                continue
+            self._parse_node(
+                event,
+                corrid_to_device,
+                corrid_to_runtime,
+                externalid_to_runtime,
+                tid2list,
+                pl_tid2list,
+                tid2zero_rt_list)
+
+        if CommLibTypes.Nccl in self.comm_lib:
+            for event in events:
+                if event.type == EventTypes.KERNEL:
+                    self._update_communication_node(event)
+
+        # associate CUDA Runtimes with CPU events
+        for op_list in tid2list.values():
+            for op in op_list:
+                runtime_nodes = externalid_to_runtime.pop(op.external_id, [])
+                if runtime_nodes:
+                    op.runtimes.extend(runtime_nodes)
+        for ext_id in externalid_to_runtime:
+            if ext_id != 0:
+                logger.warning("{} Runtimes with external id {} don't correlate to any operator!".format(
+                    len(externalid_to_runtime[ext_id]), ext_id))
+
+        if len(corrid_to_device) > 0:
+            node_count_dict = defaultdict(int)
+            for nodes in corrid_to_device.values():
+                for n in nodes:
+                    node_count_dict[n.type] += 1
+
+            logger.debug(("Some events don't belong to any operator: "
+                          f"{', '.join([':'.join((k, str(v))) for k, v in node_count_dict.items()])}"))
+
+        staled_device_nodes = []
+        for device_nodes in corrid_to_device.values():
+            staled_device_nodes.extend([n for n in device_nodes if n.type == EventTypes.KERNEL])
+
+        return tid2list, tid2zero_rt_list, staled_device_nodes, pl_tid2list
+
+    def _update_communication_node(self, event: KernelEvent):
+        """Update the communication node by using the TraceEvent instance"""
+        external_id = event.external_id
+        comm_node = self.communication_data.get(external_id)
+        if comm_node:
+            ts = event.ts
+            dur = event.duration
+            comm_node.kernel_ranges.append((ts, ts + dur))
+            comm_node.total_time += dur
+
+        return comm_node is not None
+
+    def _parse_node(self,
+                    event: DurationEvent,
+                    corrid_to_device: Dict[int, List[DeviceNode]],
+                    corrid_to_runtime: Dict[int, RuntimeNode],
+                    externalid_to_runtime: Dict[int, List[RuntimeNode]],
+                    tid2list: Dict[int, List[OperatorNode]],
+                    pl_tid2list: Dict[int, List[PLProfileNode]],
+                    tid2zero_rt_list: Dict[int, List[RuntimeNode]]):
+        corrid = event.correlation_id
+        tid = event.tid
+        if event.type in [EventTypes.KERNEL, EventTypes.MEMCPY, EventTypes.MEMSET]:
+            self.used_devices.add(event.pid)
+            device_node = DeviceNode.create(event)
+            if corrid in corrid_to_runtime:
+                rt_node = corrid_to_runtime[corrid]  # Don't pop it because it may be used by the next kernel.
+                if rt_node.device_nodes is None:
+                    rt_node.device_nodes = []
+                rt_node.device_nodes.append(device_node)
+
+                # Check the external_id
+                if rt_node.external_id != device_node.external_id:
+                    logger.warning(
+                        'Runtime and Device-op have the same correlation id %s but different external ids!'
+                        ' (runtime external_id, device external_id): (%s, %s)' %
+                        (corrid, rt_node.external_id, device_node.external_id))
+            else:
+                corrid_to_device[corrid].append(device_node)
+            self.device_node_list.append(device_node)
+        elif event.type == EventTypes.RUNTIME:
+            device_nodes = corrid_to_device.pop(corrid, None)
+            rt_node = RuntimeNode.create(event, device_nodes)
+            corrid_to_runtime[corrid] = rt_node
+            externalid_to_runtime[rt_node.external_id].append(rt_node)
+            # Some runtimes have external_id 0 and will not be correlated to any operator,
+            # so collect them and attach them to the root node.
+            if rt_node.external_id == 0:
+                tid2zero_rt_list[tid].append(rt_node)
+            self.runtime_node_list.append(rt_node)
+
+            # check the external_id
+            if device_nodes:
+                for device_node in device_nodes:
+                    if rt_node.external_id != device_node.external_id:
+                        logger.warning(
+                            'Runtime and Device-op have the same correlation id %s but different external ids!'
+                            ' (rt external_id, device external_id): (%s, %s)' %
+                            (corrid, rt_node.external_id, device_node.external_id))
+        elif event.type in [EventTypes.PYTHON,
+                            EventTypes.OPERATOR,
+                            EventTypes.PL_MODULE,
+                            EventTypes.PROFILER_STEP,
+                            EventTypes.MODULE]:
+            if event.type == EventTypes.PROFILER_STEP:
+                op_node = ProfilerStepNode.create(event)
+            elif event.type == EventTypes.MODULE:
+                op_node = ModuleNode.create(event)
+            elif event.type == EventTypes.PL_MODULE:
+                op_node = PLModuleNode.create(event)
+            else:
+                op_node = create_operator_node(event)
+            if event.name in NcclOpNameSet or event.name in GlooOpNameSet:
+                comm_node = CommunicationNode.create(event)
+                if event.name in NcclOpNameSet:
+                    self.comm_lib.add(CommLibTypes.Nccl)
+                if event.name in GlooOpNameSet:
+                    self.comm_lib.add(CommLibTypes.Gloo)
+                ts = event.ts
+                dur = event.duration
+                comm_node.kernel_ranges.append((ts, ts + dur))
+                comm_node.total_time = dur
+                self.communication_data[op_node.external_id] = comm_node
+            if event.name == 'DataParallel.forward':
+                self.use_dp = True
+            if event.name == 'DistributedDataParallel.forward':
+                self.use_ddp = True
+            tid2list[int(tid)].append(op_node)
+        elif event.type == EventTypes.PL_PROFILE:
+            op_node = PLProfileNode.create(event)
+            pl_tid2list[int(tid)].append(op_node)
+
+
+class StepParser:
+    def __init__(self):
+        # We could not use [[]] * len here, since the sublists would all point to the same list:
+        # https://stackoverflow.com/questions/12791501/python-initializing-a-list-of-lists
+        # https://stackoverflow.com/questions/240178/list-of-lists-changes-reflected-across-sublists-unexpectedly
+        self.role_ranges: List[List[Tuple[int, int]]] = [[] for _ in range(ProfileRole.Total - 1)]
+        self.steps: List[Tuple[int, int]] = []
+        self.steps_names: List[str] = []
+        self.cpu_min_ts = sys.maxsize  # Min time of CPU side events.
+        self.cpu_max_ts = -sys.maxsize - 1  # Max time of CPU side events.
+        self.global_min_ts = sys.maxsize  # Min time of all events.
+        self.global_max_ts = -sys.maxsize - 1  # Max time of all events.
+        # The two fields below form the time range used when adding GPU utilization to the trace view.
+        # Use the span of the 'PyTorch Profiler (0)' event for them;
+        # if it does not exist, fall back to global_min_ts and global_max_ts.
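+        # e.g. (illustrative): a 'PyTorch Profiler (0)' event at ts=1000 with a duration of
+        # 500000 us yields (global_start_ts, global_end_ts) = (1000, 501000).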
+        self.global_start_ts = sys.maxsize
+        self.global_end_ts = -sys.maxsize - 1
+
+    def parse_steps(self, events: Iterable[DurationEvent], comm_nodes: Dict[int, CommunicationNode]):
+        for event in events:
+            if event.type == EventTypes.MEMORY:
+                continue
+
+            self._parse_step(event, comm_nodes)
+            if event.type == EventTypes.TRACE and event.name == 'PyTorch Profiler (0)':
+                self.global_start_ts = event.ts
+                self.global_end_ts = event.ts + event.duration
+        if self.global_start_ts == sys.maxsize:
+            self.global_start_ts = self.global_min_ts
+        if self.global_end_ts == -sys.maxsize - 1:
+            self.global_end_ts = self.global_max_ts
+
+        if len(self.steps) == 0:
+            self.steps.append((self.cpu_min_ts, self.cpu_max_ts))
+            self.steps_names.append('0')
+
+        for i in range(len(self.role_ranges)):
+            self.role_ranges[i] = merge_ranges(self.role_ranges[i])
+
+    def update_device_steps(self, runtime_node_list: List[RuntimeNode]):
+        self._update_steps_duration(*self._find_device_steps(runtime_node_list))
+
+    @property
+    def has_runtime(self):
+        return bool(self.role_ranges[ProfileRole.Runtime])
+
+    @property
+    def has_kernel(self):
+        return bool(self.role_ranges[ProfileRole.Kernel])
+
+    @property
+    def has_communication(self):
+        return bool(self.role_ranges[ProfileRole.Communication])
+
+    @property
+    def has_memcpy_or_memset(self):
+        return bool(self.role_ranges[ProfileRole.Memcpy] or self.role_ranges[ProfileRole.Memset])
+
+    def _parse_step(self, event: DurationEvent, comm_nodes: Dict[int, CommunicationNode]):
+        ts = event.ts
+        dur = event.duration
+        evt_type = event.type
+        if evt_type == EventTypes.KERNEL:
+            if event.external_id in comm_nodes:
+                self.role_ranges[ProfileRole.Communication].append((ts, ts + dur))
+            else:
+                self.role_ranges[ProfileRole.Kernel].append((ts, ts + dur))
+        elif evt_type == EventTypes.MEMCPY:
+            self.role_ranges[ProfileRole.Memcpy].append((ts, ts + dur))
+        elif evt_type == EventTypes.MEMSET:
+            self.role_ranges[ProfileRole.Memset].append((ts, ts + dur))
+        elif evt_type == EventTypes.RUNTIME:
+            self.role_ranges[ProfileRole.Runtime].append((ts, ts + dur))
+        elif (evt_type == EventTypes.OPERATOR and (
+                (event.name.startswith('enumerate(DataLoader)#') and event.name.endswith('.__next__'))
+                or event.name.startswith('enumerate(DataPipe)#'))):
+            self.role_ranges[ProfileRole.DataLoader].append((ts, ts + dur))
+        elif evt_type == EventTypes.PROFILER_STEP:
+            self.steps.append((ts, ts + dur))
+            self.steps_names.append(str(event.step))
+        elif evt_type in [EventTypes.PYTHON, EventTypes.OPERATOR]:
+            if event.name in GlooOpNameSet:
+                self.role_ranges[ProfileRole.Communication].append((ts, ts + dur))
+            else:
+                self.role_ranges[ProfileRole.CpuOp].append((ts, ts + dur))
+
+        # Record host side min and max time.
+        if evt_type in [EventTypes.PYTHON, EventTypes.OPERATOR, EventTypes.PROFILER_STEP]:
+            self.cpu_min_ts = min(self.cpu_min_ts, ts)
+            self.cpu_max_ts = max(self.cpu_max_ts, ts + dur)
+        # Record global min and max time.
+        self.global_min_ts = min(self.global_min_ts, ts)
+        self.global_max_ts = max(self.global_max_ts, ts + dur)
+
+    def _find_device_steps(self, runtime_node_list: List[RuntimeNode]):
+        """Return the steps associated with device nodes.
+        """
+ """ + runtime_node_list = sorted(runtime_node_list, key=lambda x: x.start_time) + + # Use similar code with two-way merge to get all runtimes inside each host-side step span, + # then record each step's min kernel start time and max kernel end time: + steps_device: List[Tuple[int, int]] = [(sys.maxsize, -sys.maxsize - 1)] * len(self.steps) + # where the steps associated with devcie node, if yes, the related array item is larger than 0. + steps_matched_device_nodes: List[int] = [0] * len(self.steps) + + i_step = 0 + i_runtime = 0 + step_device_min_ts = sys.maxsize + step_device_max_ts = -sys.maxsize - 1 + matched_device_nodes = set() + + while i_step < len(self.steps) and i_runtime < len(runtime_node_list): + step_host_start_time = self.steps[i_step][0] + step_host_end_time = self.steps[i_step][1] + if runtime_node_list[i_runtime].start_time < step_host_start_time: + # This runtime is ahead of or intersects with this step span. Skip this runtime. + i_runtime += 1 + elif runtime_node_list[i_runtime].end_time <= step_host_end_time: + # and runtime_node_list[i_runtime].start_time >= step_host_start_time + # This runtime is inside this step span. Scan its device_nodes. + rt = runtime_node_list[i_runtime] + if rt.device_nodes is not None: + for device_node in rt.device_nodes: + step_device_min_ts = min(device_node.start_time, step_device_min_ts) + step_device_max_ts = max(device_node.end_time, step_device_max_ts) + matched_device_nodes.add(device_node) + steps_matched_device_nodes[i_step] += 1 + i_runtime += 1 + elif runtime_node_list[i_runtime].start_time < step_host_end_time: + # and runtime_node_list[i_runtime].end_time > step_host_end_time + # This runtime intersects with this step span. Skip this runtime. + i_runtime += 1 + else: + # runtime_node_list[i_runtime].start_time >= step_host_end_time + # This runtime starts after this step's end. Record and move forward this step. + steps_device[i_step] = (step_device_min_ts, step_device_max_ts) + i_step += 1 + step_device_min_ts = sys.maxsize + step_device_max_ts = -sys.maxsize - 1 + + while i_step < len(self.steps): + # This step doesn't launch any device side event, just assign it as empty. + steps_device[i_step] = (step_device_min_ts, step_device_max_ts) + step_device_min_ts = sys.maxsize + step_device_max_ts = -sys.maxsize - 1 + i_step += 1 + + # If there are matched device, find the first step end time before steps_device[0][0] + prev_step_end_time: Optional[int] = None + if len(matched_device_nodes) > 0: + prev_step_end_time = self.steps[0][0] + if steps_device[0][0] != sys.maxsize: # When step 0 has device event. + for device_node in self.device_node_list: + if device_node not in matched_device_nodes: + # Now this device_node is not launched inside any step span. + if device_node.end_time < steps_device[0][0]: + prev_step_end_time = max(prev_step_end_time, device_node.end_time) + + return prev_step_end_time, steps_device, steps_matched_device_nodes + + def _update_steps_duration(self, + prev_step_end_time: Optional[int], + steps_device: List[Tuple[int, int]], + steps_matched_device_nodes: List[int]): + """Update self.steps considering device side events launched by each host side step. + Update self.steps_names if some tail steps are removed.""" + + # Change step time to device side on the condition that any step have device time. 
+        is_use_gpu = prev_step_end_time is not None
+        if is_use_gpu:
+            for i_step in range(len(self.steps)):
+                step_start_time = max(prev_step_end_time, self.steps[i_step][0])
+                step_end_time = self.steps[i_step][1]
+                if steps_device[i_step][0] == sys.maxsize:  # When step i_step has no device event.
+                    # Assign to step_start_time when the kernel is behind the host step end.
+                    step_end_time = max(step_end_time, step_start_time)
+                else:
+                    step_end_time = max(step_end_time, steps_device[i_step][1])
+                    if step_end_time < step_start_time:
+                        logger.warning(
+                            'Abnormal step_end_time of step {}: [{}, {}]'.format(
+                                i_step, step_start_time, step_end_time))
+                        step_end_time = step_start_time
+                self.steps[i_step] = (step_start_time, step_end_time)  # Update step time considering device side.
+                prev_step_end_time = step_end_time
+
+        is_remove_tail_steps = True  # TODO: Use a tensorboard argument instead.
+        if is_use_gpu and len(self.steps) > 1 and is_remove_tail_steps:
+            i_step = len(self.steps) - 1
+            while i_step >= 0:
+                if steps_matched_device_nodes[i_step] > 0:
+                    break
+                i_step -= 1
+            if i_step >= 0:
+                keep_steps = i_step + 1
+                if i_step > 0 and steps_matched_device_nodes[i_step - 1] * 0.8 > steps_matched_device_nodes[i_step]:
+                    keep_steps = i_step
+                if keep_steps < len(self.steps):
+                    logger.warning(
+                        'Remove the last {} steps from the overview, '
+                        'because the profiler may fail to capture all the kernels launched by these steps.'.format(
+                            len(self.steps) - keep_steps
+                        ))
+                    self.steps = self.steps[:keep_steps]
+                    self.steps_names = self.steps_names[:keep_steps]
+
+
+class EventParser(NodeParserMixin, StepParser):
+    def __init__(self):
+        super().__init__()
+        self.comm_node_list: List[CommunicationNode] = None
+
+    def parse(self, events: Iterable[BaseEvent], fwd_bwd_map: Dict[int, int]) \
+            -> Tuple[Dict[int, OperatorNode], Dict[int, OperatorNode]]:
+        with utils.timing('EventParser: parse nodes'):
+            tid2list, tid2zero_rt_list, staled_device_nodes, pl_tid2list = self.parse_nodes(events)
+
+        with utils.timing('EventParser: build operator tree'):
+            builder = OpTreeBuilder()
+            tid2tree = builder.build_tree(tid2list, tid2zero_rt_list, staled_device_nodes, fwd_bwd_map=fwd_bwd_map)
+            pl_tid2tree = builder.build_tree(pl_tid2list, {}, [], {})
+
+        with utils.timing('EventParser: parse steps times'):
+            # Process steps
+            self.parse_steps(events, self.communication_data)
+            if len(self.comm_lib) > 1:
+                logger.warning(
+                    'Multiple communication libs are found. To avoid confusion, we disable the distributed view.')
+                self.communication_data.clear()
+
+        # Move the interleaved logic out of each NodeParser and StepParser
+        self.update_device_steps(self.runtime_node_list)
+
+        self.comm_node_list = generate_communication_nodes(self.communication_data, self.steps, self.steps_names)
+        return tid2tree, pl_tid2tree
+
+    @staticmethod
+    def print_tree(root):
+        class Ctx:
+            tid: int = -1
+            name_stack: list = []
+
+        ctx = Ctx()
+
+        def print_node_set_prefix(node: OperatorNode):
+            header = f'[{ctx.tid}]' + '.'.join(ctx.name_stack[1:])  # omit the CallTreeRoot
+            prefix_len = len(ctx.name_stack) * 4 - 4 - 1
+            if len(ctx.name_stack) > 1:
+                print(header)
+                prefix = ' ' * prefix_len
+                print(prefix, node.name)
+                print(prefix, 'time:', node.start_time, '-->', node.end_time)
+
+        def push(node: OperatorNode):
+            ctx.name_stack.append(node.name)
+
+        def pop():
+            ctx.name_stack.pop()
+
+        def traverse_operator_node(node: OperatorNode):
+            print_node_set_prefix(node)
+
+            push(node)
+            for n in node.children:
+                traverse_operator_node(n)
+            pop()
+
+        ctx.tid = root.tid
+        traverse_operator_node(root)
+        ctx.tid = -1
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..74e7b54dfc53bb2979df6e714f25600560c992cc
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/gpu_metrics_parser.py
@@ -0,0 +1,314 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# --------------------------------------------------------------------------
+from typing import Iterable, List
+
+from .. import consts, utils
+from .range_utils import (get_ranges_sum, intersection_ranges_lists,
+                          intersection_ranges_lists_with_value, merge_ranges,
+                          merge_ranges_with_value)
+from .trace import BaseEvent, EventTypes, KernelEvent
+
+logger = utils.get_logger()
+
+
+# For calculating GPU utilization and approximated SM efficiency.
+class GPUMetricsParser(object):
+    def __init__(self):
+        # All gpu ids that are used by any kernel.
+        self.gpu_ids = set()
+        # For calculating GPU utilization.
+        self.kernel_ranges_per_device = [[] for _ in range(consts.MAX_GPU_PER_NODE)]
+        self.gpu_utilization = [None] * consts.MAX_GPU_PER_NODE
+        self.gpu_util_timeline_unit_size = 0
+        self.gpu_util_timeline_unit_name = ''
+        self.gpu_util_buckets = [[] for _ in range(consts.MAX_GPU_PER_NODE)]
+        # For calculating approximated SM efficiency.
+        self.blocks_per_sm_per_device = [[] for _ in range(consts.MAX_GPU_PER_NODE)]
+        self.avg_approximated_sm_efficiency_per_device = [None] * consts.MAX_GPU_PER_NODE
+        self.approximated_sm_efficiency_ranges = [[] for _ in range(consts.MAX_GPU_PER_NODE)]
+        self.gpu_sm_efficiency_json = None
+        self.blocks_per_sm_count = [0] * consts.MAX_GPU_PER_NODE
+        # For calculating averaged occupancy.
+        self.occupancy_per_device = [[] for _ in range(consts.MAX_GPU_PER_NODE)]
+        self.avg_occupancy_per_device = [None] * consts.MAX_GPU_PER_NODE
+        self.occupancy_count = [0] * consts.MAX_GPU_PER_NODE
+
+    def calculate_gpu_utilization(self, global_start_time, global_end_time, steps_start_time, steps_end_time):
+        # Make bucket_size a power of 10 in microseconds, with the number of buckets in (10, 100].
+        # Powers of 10 are straightforward for users to understand.
+        # If there are too many buckets, the GPU utilization values degenerate to either 0 or 1.
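+        # Worked example (hypothetical): a 250,000 us trace range yields
+        # bucket_size=10,000, buckets=25, unit=10 and unit_str='ms'.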
+        def get_bucket_info(range_micro_seconds):
+            max_buckets = 100
+            bucket_size = 1
+            while range_micro_seconds / bucket_size > max_buckets:
+                bucket_size *= 10
+            buckets = int(range_micro_seconds / bucket_size)
+            unit = bucket_size
+            unit_str = 'us'
+            if unit >= 1000:
+                unit /= 1000
+                unit_str = 'ms'
+                if unit >= 1000:
+                    unit /= 1000
+                    unit_str = 's'
+            return int(bucket_size), int(buckets), int(unit), unit_str
+
+        gpu_utilization_timeline = [[] for _ in range(consts.MAX_GPU_PER_NODE)]
+        for gpu_id in self.gpu_ids:
+            self.kernel_ranges_per_device[gpu_id] = merge_ranges(self.kernel_ranges_per_device[gpu_id])
+
+            # The top-level number still considers steps, to be consistent with the overview's breakdown.
+            kernel_ranges_all_steps = intersection_ranges_lists(
+                self.kernel_ranges_per_device[gpu_id], [(steps_start_time, steps_end_time)])
+            ranges_sum = get_ranges_sum(kernel_ranges_all_steps)
+            self.gpu_utilization[gpu_id] = ranges_sum / (steps_end_time - steps_start_time)
+
+            # The timeline will use 'PyTorch Profiler (0)' as its start,
+            # in order to draw the GPU utilization of the previous step's kernels.
+            bucket_size, buckets, self.gpu_util_timeline_unit_size, self.gpu_util_timeline_unit_name = \
+                get_bucket_info(global_end_time - global_start_time)
+            buckets_ranges = []
+            for i in range(buckets):
+                buckets_ranges.append((global_start_time + i * bucket_size,
+                                       global_start_time + (i + 1) * bucket_size if i < buckets - 1
+                                       else global_end_time))  # The last bucket may be longer.
+            gpu_utilization_timeline[gpu_id] = [0] * buckets
+            if len(self.kernel_ranges_per_device[gpu_id]) > 0:
+                current_range_index = 0
+                current_range = self.kernel_ranges_per_device[gpu_id][current_range_index]
+                current_bucket_index = 0
+                current_bucket = buckets_ranges[0]
+                while (current_range_index < len(self.kernel_ranges_per_device[gpu_id])
+                        and current_bucket_index < buckets):
+                    if current_bucket[1] <= current_range[0]:
+                        current_bucket_index += 1
+                        current_bucket = buckets_ranges[current_bucket_index] if current_bucket_index < buckets \
+                            else None
+                    elif current_bucket[0] >= current_range[1]:
+                        current_range_index += 1
+                        if current_range_index < len(self.kernel_ranges_per_device[gpu_id]):
+                            current_range = self.kernel_ranges_per_device[gpu_id][current_range_index]
+                    else:
+                        left_bound = max(current_range[0], current_bucket[0])
+                        right_bound = min(current_range[1], current_bucket[1])
+                        gpu_utilization_timeline[gpu_id][current_bucket_index] += (right_bound - left_bound)
+                        if current_bucket[1] < current_range[1]:
+                            current_bucket_index += 1
+                            current_bucket = buckets_ranges[current_bucket_index] if current_bucket_index < buckets \
+                                else None
+                        else:
+                            current_range_index += 1
+                            if current_range_index < len(self.kernel_ranges_per_device[gpu_id]):
+                                current_range = self.kernel_ranges_per_device[gpu_id][current_range_index]
+            for i_bucket in range(buckets):
+                bucket_size = buckets_ranges[i_bucket][1] - buckets_ranges[i_bucket][0]
+                gpu_utilization_timeline[gpu_id][i_bucket] /= bucket_size
+                start_time = buckets_ranges[i_bucket][0]
+                self.gpu_util_buckets[gpu_id].append((start_time, gpu_utilization_timeline[gpu_id][i_bucket]))
+            start_time = buckets_ranges[-1][1]
+            self.gpu_util_buckets[gpu_id].append((start_time, 0))
+
+        self.kernel_ranges_per_device = None  # Release memory.
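+
+    # e.g. (illustrative): if the merged kernel ranges of a GPU cover 6,000 us of a
+    # 10,000 us step window, gpu_utilization for that GPU is 0.6.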
+
+    def calculate_approximated_sm_efficiency(self, steps_start_time, steps_end_time):
+        def calculate_avg(approximated_sm_efficiency_ranges, total_dur):
+            total_weighted_sm_efficiency = 0.0
+            for r in approximated_sm_efficiency_ranges:
+                dur = r[1] - r[0]
+                total_weighted_sm_efficiency += r[2] * dur
+            avg_approximated_sm_efficiency = total_weighted_sm_efficiency / total_dur
+            return avg_approximated_sm_efficiency
+
+        total_dur = steps_end_time - steps_start_time
+        for gpu_id in self.gpu_ids:
+            blocks_per_sm_ranges = self.blocks_per_sm_per_device[gpu_id]
+            approximated_sm_efficiency_ranges = merge_ranges_with_value(blocks_per_sm_ranges)
+            # To be consistent with GPU utilization, here it must also intersect with all steps,
+            # in order to remove the kernels outside of the steps range.
+            approximated_sm_efficiency_ranges_all_steps = intersection_ranges_lists_with_value(
+                approximated_sm_efficiency_ranges, [(steps_start_time, steps_end_time)])
+            if len(approximated_sm_efficiency_ranges_all_steps) > 0:
+                avg_approximated_sm_efficiency = calculate_avg(approximated_sm_efficiency_ranges_all_steps, total_dur)
+                self.avg_approximated_sm_efficiency_per_device[gpu_id] = avg_approximated_sm_efficiency
+
+            # The timeline still uses all kernels, including those outside of the steps scope.
+            if len(approximated_sm_efficiency_ranges) > 0:
+                self.approximated_sm_efficiency_ranges[gpu_id] = approximated_sm_efficiency_ranges
+
+        self.blocks_per_sm_per_device = None  # Release memory.
+
+    # Weighted average, weighted by each kernel's time duration.
+    def calculate_occupancy(self, steps_start_time, steps_end_time):
+        for gpu_id in self.gpu_ids:
+            occupancies_on_a_device = self.occupancy_per_device[gpu_id]
+            total_time = 0
+            total_occupancy = 0.0
+            for r in occupancies_on_a_device:
+                min_time = max(r[0], steps_start_time)
+                max_time = min(r[1], steps_end_time)
+                if min_time < max_time:
+                    dur = max_time - min_time
+                    total_occupancy += r[2] * dur
+                    total_time += dur
+            if total_time > 0:
+                self.avg_occupancy_per_device[gpu_id] = total_occupancy / total_time
+
+    @classmethod
+    def parse_events(cls,
+                     events: Iterable[BaseEvent],
+                     global_start_time: int,
+                     global_end_time: int,
+                     steps_start_time: int,
+                     steps_end_time: int):
+        parser = GPUMetricsParser()
+        logger.debug('GPU Metrics, parse events')
+        for event in events:
+            if event.type == EventTypes.KERNEL:
+                parser.parse_event(event)
+
+        parser.calculate_gpu_utilization(global_start_time, global_end_time, steps_start_time, steps_end_time)
+        parser.calculate_approximated_sm_efficiency(steps_start_time, steps_end_time)
+        parser.calculate_occupancy(steps_start_time, steps_end_time)
+        return parser
+
+    def parse_event(self, event: KernelEvent):
+        ts = event.ts
+        dur = event.duration
+        gpu_id = event.device_id
+        if gpu_id != event.pid:
+            logger.warning("pid '{}' is not equal to args.device '{}' on the event with ts '{}'".format(
+                event.pid, gpu_id, event.ts))
+        if gpu_id is not None:
+            if gpu_id not in self.gpu_ids:
+                self.gpu_ids.add(gpu_id)
+            self.kernel_ranges_per_device[gpu_id].append((ts, ts + dur))
+            if event.blocks_per_sm is not None:
+                if event.blocks_per_sm > 0.0:
+                    self.blocks_per_sm_per_device[gpu_id].append((ts, ts + dur, event.blocks_per_sm))
+                    self.blocks_per_sm_count[gpu_id] += 1
+                else:
+                    # Workaround for negative value input.
+                    logger.warning('blocks per SM {} with ts {} is not positive!'.format(event.blocks_per_sm, ts))
+            if event.occupancy is not None:
+                if event.occupancy >= 0.0:
+                    self.occupancy_per_device[gpu_id].append((ts, ts + dur, event.occupancy))
+                    self.occupancy_count[gpu_id] += 1
+                else:
+                    # Workaround for negative value input.
+                    logger.warning('est. achieved occupancy % {} with ts {} is negative!'.format(event.occupancy, ts))
+
+    def get_gpu_metrics_columns(self):
+        columns = []
+        if self.has_blocks_per_sm:
+            columns.append({'type': 'number', 'name': 'Mean Blocks Per SM',
+                            'tooltip': consts.TOOLTIP_BLOCKS_PER_SM})
+        if self.has_occupancy:
+            columns.append({'type': 'number', 'name': 'Mean Est. Achieved Occupancy (%)',
+                            'tooltip': consts.TOOLTIP_OCCUPANCY_COMMON + consts.TOOLTIP_OCCUPANCY_TABLE})
+        return columns
+
+    @property
+    def has_blocks_per_sm(self):
+        return sum(self.blocks_per_sm_count) > 0
+
+    @property
+    def has_occupancy(self):
+        return sum(self.occupancy_count) > 0
+
+    def get_gpu_metrics(self):
+        def build_trace_counter_gpu_util(gpu_id, start_time, counter_value):
+            util_json = ("{{\"ph\":\"C\", \"name\":\"GPU {} Utilization\", \"pid\":{}, \"ts\":{}, "
+                         "\"args\":{{\"GPU Utilization\":{}}}}}").format(gpu_id, gpu_id, start_time, counter_value)
+            return util_json
+
+        def build_trace_counter_sm_efficiency(gpu_id, start_time, counter_value):
+            util_json = ("{{\"ph\":\"C\", \"name\":\"GPU {} Est. SM Efficiency\", \"pid\":{}, \"ts\":{}, "
+                         "\"args\":{{\"Est. SM Efficiency\":{}}}}}").format(gpu_id, gpu_id, start_time, counter_value)
+            return util_json
+
+        def add_trace_counter_gpu_util(gpu_id, start_time, counter_value, counter_json_list: List):
+            json_str = build_trace_counter_gpu_util(gpu_id, start_time, counter_value)
+            counter_json_list.append(json_str)
+
+        def add_trace_counter_sm_efficiency(gpu_id, start_time, end_time, value, counter_json_list: List):
+            efficiency_json_start = build_trace_counter_sm_efficiency(gpu_id, start_time, value)
+            efficiency_json_finish = build_trace_counter_sm_efficiency(gpu_id, end_time, 0)
+            counter_json_list.append(efficiency_json_start)
+            counter_json_list.append(efficiency_json_finish)
+
+        counter_json_list = []
+        for gpu_id, buckets in enumerate(self.gpu_util_buckets):
+            if len(buckets) > 0:
+                # Add 1 as a baseline, to avoid a misleading visualization when the max value is less than 1.
+                add_trace_counter_gpu_util(gpu_id, buckets[0][0], 1, counter_json_list)
+                add_trace_counter_gpu_util(gpu_id, buckets[0][0], 0, counter_json_list)
+                for b in buckets:
+                    add_trace_counter_gpu_util(gpu_id, b[0], b[1], counter_json_list)
+        for gpu_id, ranges in enumerate(self.approximated_sm_efficiency_ranges):
+            buckets = self.gpu_util_buckets[gpu_id]
+            if len(ranges) > 0 and len(buckets) > 0:
+                # Add 1 as a baseline, to avoid a misleading visualization when the max value is less than 1.
+ add_trace_counter_sm_efficiency(gpu_id, buckets[0][0], buckets[0][0], 1, counter_json_list)
+ for r in ranges:
+ add_trace_counter_sm_efficiency(gpu_id, r[0], r[1], r[2], counter_json_list)
+
+ return counter_json_list
+
+ def get_gpu_metrics_data_tooltip(
+ self,
+ gpu_infos,
+ tc_ratio):
+ if not self.gpu_ids:
+ return None, None
+
+ has_sm_efficiency = False
+ has_occupancy = False
+ has_tc = False
+
+ gpu_metrics_data = []
+ gpu_info_columns = ['Name', 'Memory', 'Compute Capability']
+
+ def process_gpu(gpu_id: int):
+ nonlocal has_sm_efficiency, has_occupancy, has_tc
+ gpu_metrics_data.append({'title': 'GPU {}:'.format(gpu_id), 'value': ''})
+ gpu_info = gpu_infos.get(gpu_id, None)
+ if gpu_info is not None:
+ for key in gpu_info_columns:
+ if key in gpu_info:
+ gpu_metrics_data.append({'title': key, 'value': gpu_info[key]})
+ else:
+ # Legacy chrome tracing files do not carry GPU info.
+ pass
+ gpu_metrics_data.append({'title': 'GPU Utilization', 'value': '{} %'.format(
+ round(self.gpu_utilization[gpu_id] * 100, 2))})
+ if self.avg_approximated_sm_efficiency_per_device[gpu_id] is not None:
+ gpu_metrics_data.append({'title': 'Est. SM Efficiency', 'value': '{} %'.format(
+ round(self.avg_approximated_sm_efficiency_per_device[gpu_id] * 100, 2))})
+ has_sm_efficiency = True
+ if self.avg_occupancy_per_device[gpu_id] is not None:
+ gpu_metrics_data.append({'title': 'Est. Achieved Occupancy', 'value': '{} %'.format(
+ round(self.avg_occupancy_per_device[gpu_id], 2))})
+ has_occupancy = True
+ if tc_ratio[gpu_id] is not None:
+ gpu_metrics_data.append({'title': 'Kernel Time using Tensor Cores', 'value': '{} %'.format(
+ round(tc_ratio[gpu_id] * 100, 2))})
+ has_tc = True
+
+ gpu_ids = list(self.gpu_ids)
+ process_gpu(gpu_ids[0])
+ for idx in range(1, len(gpu_ids)):
+ # Append a separator line between GPUs for readability.
+ gpu_metrics_data.append({'title': '
', 'value': ''}) + process_gpu(gpu_ids[idx]) + + tooltip_summary = 'The GPU usage metrics:\n' + tooltip = '{}\n{}'.format(tooltip_summary, consts.TOOLTIP_GPU_UTIL) + if has_sm_efficiency: + tooltip += '\n' + consts.TOOLTIP_SM_EFFICIENCY + if has_occupancy: + tooltip += '\n' + consts.TOOLTIP_OCCUPANCY_COMMON + consts.TOOLTIP_OCCUPANCY_OVERVIEW + if has_tc: + tooltip += '\n' + consts.TOOLTIP_TENSOR_CORES + + return gpu_metrics_data, tooltip diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/kernel_parser.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/kernel_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..838fc38ce60619977c3e096791241d7fc697562d --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/kernel_parser.py @@ -0,0 +1,45 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# -------------------------------------------------------------------------- +from typing import Optional + +import numpy as np +import pandas as pd + +from .tensor_core import TC_Allowlist +from .trace import EventTypes + + +class KernelParser: + def __init__(self): + self.kernel_stat: Optional[pd.DataFrame] = None + self.tc_used_ratio = 0.0 + + def parse_events(self, events): + events = [vars(event) for event in events if event.type == EventTypes.KERNEL] + events = pd.DataFrame(events) + events = events.astype({'type': 'category', 'name': 'string'}, copy=False) + events['tc_used'] = events['name'].map(lambda name: name in TC_Allowlist) + + def weighted_avg(x: pd.Series): + try: + # fill these None as zero + x = x.fillna(0) + return np.average(x, weights=events.loc[x.index, 'duration']) + except ZeroDivisionError: + return 0 + + self.kernel_stat = events.groupby('name').agg( + tc_used=('tc_used', 'first'), + count=('duration', 'count'), + sum=('duration', 'sum'), + mean=('duration', 'mean'), + max=('duration', 'max'), + min=('duration', 'min'), + blocks_per_sm=('blocks_per_sm', weighted_avg), + occupancy=('occupancy', weighted_avg)).sort_values('sum', ascending=False) + + tc_total = self.kernel_stat['sum'].sum() + tc_self = self.kernel_stat[self.kernel_stat['tc_used']]['sum'].sum() + if tc_total > 0: + self.tc_used_ratio = tc_self / tc_total diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..424e62d7df8abdc26b13bf05472ca3a02991b84b --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py @@ -0,0 +1,166 @@ + +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# -------------------------------------------------------------------------- +import bisect +import os +import sys +from collections import defaultdict +from typing import List, Tuple + +from .. 
import consts, io, utils +from ..multiprocessing import Process, Queue +from ..run import Run, RunProfile +from .data import DistributedRunProfileData, RunProfileData +from .node import CommunicationNode +from .run_generator import DistributedRunGenerator, RunGenerator + +logger = utils.get_logger() + + +class RunLoader(object): + def __init__(self, name, run_dir, caches: io.Cache): + self.run_name = name + self.run_dir = run_dir + self.caches = caches + self.queue = Queue() + + def load(self): + workers = [] + spans_by_workers = defaultdict(list) + for path in io.listdir(self.run_dir): + if io.isdir(io.join(self.run_dir, path)): + continue + match = consts.WORKER_PATTERN.match(path) + if not match: + continue + + worker = match.group(1) + span = match.group(2) + if span is not None: + # remove the starting dot (.) + span = span[1:] + bisect.insort(spans_by_workers[worker], span) + + workers.append((worker, span, path)) + + span_index_map = {} + for worker, span_array in spans_by_workers.items(): + for i, span in enumerate(span_array, 1): + span_index_map[(worker, span)] = i + + for worker, span, path in workers: + # convert the span timestamp to the index. + span_index = None if span is None else span_index_map[(worker, span)] + p = Process(target=self._process_data, args=(worker, span_index, path)) + p.start() + logger.info('started all processing') + + distributed_run = Run(self.run_name, self.run_dir) + run = Run(self.run_name, self.run_dir) + num_items = len(workers) + while num_items > 0: + item: Tuple[RunProfile, DistributedRunProfileData] = self.queue.get() + num_items -= 1 + r, d = item + if r or d: + logger.debug('Loaded profile via mp.Queue') + if r is not None: + run.add_profile(r) + if d is not None: + distributed_run.add_profile(d) + + distributed_profiles = self._process_spans(distributed_run) + for d in distributed_profiles: + if d is not None: + run.add_profile(d) + + # for no daemon process, no need to join them since it will automatically join + return run + + def _process_data(self, worker, span, path): + import absl.logging + absl.logging.use_absl_handler() + + try: + logger.debug('Parse trace, run_dir=%s, worker=%s', self.run_dir, path) + local_file = self.caches.get_remote_cache(io.join(self.run_dir, path)) + data = RunProfileData.parse(worker, span, local_file, self.caches.cache_dir) + if data.trace_file_path != local_file: + self.caches.add_file(local_file, data.trace_file_path) + + generator = RunGenerator(worker, span, data) + profile = generator.generate_run_profile() + dist_data = DistributedRunProfileData(data) + + logger.debug('Sending back profile via mp.Queue') + self.queue.put((profile, dist_data)) + except KeyboardInterrupt: + logger.warning('tb_plugin receive keyboard interrupt signal, process %d will exit' % (os.getpid())) + sys.exit(1) + except Exception as ex: + logger.warning('Failed to parse profile data for Run %s on %s. 
Exception=%s',
+ self.run_name, worker, ex, exc_info=True)
+ self.queue.put((None, None))
+ logger.debug('finished processing data')
+
+ def _process_spans(self, distributed_run: Run):
+ spans = distributed_run.get_spans()
+ if spans is None:
+ return [self._process_distributed_profiles(distributed_run.get_profiles(), None)]
+ else:
+ span_profiles = []
+ for span in spans:
+ profiles = distributed_run.get_profiles(span=span)
+ p = self._process_distributed_profiles(profiles, span)
+ if p is not None:
+ span_profiles.append(p)
+ return span_profiles
+
+ def _process_distributed_profiles(self, profiles: List[DistributedRunProfileData], span):
+ has_communication = True
+ comm_node_lists: List[List[CommunicationNode]] = []
+ for data in profiles:
+ logger.debug('Processing profile data')
+ # Set has_communication to False and disable the distributed view if any worker has no communication.
+ if data.has_communication and data.comm_node_list:
+ comm_node_lists.append(data.comm_node_list)
+ if len(comm_node_lists[-1]) != len(comm_node_lists[0]):
+ logger.error("Number of communication operation nodes doesn't match between workers in run: %s"
+ % self.run_name)
+ has_communication = False
+ else:
+ has_communication = False
+ logger.debug('Processing profile data finished')
+
+ if not has_communication:
+ logger.debug('There is no communication profile in this run.')
+ return None
+
+ worker_num = len(comm_node_lists)
+ for i, node in enumerate(comm_node_lists[0]):
+ kernel_range_size = len(node.kernel_ranges)
+ # loop over all communication kernel ranges in order
+ for j in range(kernel_range_size):
+ min_range = sys.maxsize
+ # For each kernel_range, take the minimum duration across workers as the real communication time.
+ for k in range(worker_num):
+ kernel_ranges = comm_node_lists[k][i].kernel_ranges
+ if len(kernel_ranges) != kernel_range_size:
+ logger.error("Number of communication kernels doesn't match between workers in run: %s"
+ % self.run_name)
+ has_communication = False
+ return None
+ if kernel_ranges:
+ if kernel_ranges[j][1] - kernel_ranges[j][0] < min_range:
+ min_range = kernel_ranges[j][1] - kernel_ranges[j][0]
+ for k in range(worker_num):
+ kernel_range = comm_node_lists[k][i].kernel_ranges[j]
+ comm_node_lists[k][i].real_time_ranges.append((kernel_range[1] - min_range, kernel_range[1]))
+
+ for data in profiles:
+ data.communication_parse()
+
+ generator = DistributedRunGenerator(profiles, span)
+ profile = generator.generate_run_profile()
+ return profile
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/memory_parser.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/memory_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..014542c56892b41dd2368dd7d1b52aa0bf897325
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/memory_parser.py
@@ -0,0 +1,328 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# --------------------------------------------------------------------------
+from collections import defaultdict
+from enum import IntEnum
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from ..
import utils +from .node import OperatorNode, is_operator_node +from .op_agg import aggregate_ops +from .trace import DeviceType, MemoryEvent + +logger = utils.get_logger() + + +class MemoryMetrics(IntEnum): + SelfIncreaseSize = 0 + SelfAllocationSize = 1 + SelfAllocationCount = 2 + IncreaseSize = 3 + AllocationSize = 4 + AllocationCount = 5 + + +class MemoryRecord: + def __init__(self, scope: str, pid: int, tid: int, ts: int, + device_type: DeviceType, device_id: int, + address: int, bytes: int, total_allocated: float, total_reserved: float): + self.scope = scope + self.tid = tid + self.pid = pid + self.ts = ts + self.device_type = device_type + self.device_id = device_id + self.addr = address + self.bytes = bytes + self.total_allocated = total_allocated + self.total_reserved = total_reserved + self.op_name: Optional[str] = None + self.parent_op_name: Optional[str] = None + + @property + def device_name(self): + if self.device_type == DeviceType.CPU: + return 'CPU' + elif self.device_type == DeviceType.CUDA: + return 'GPU{}'.format(self.device_id) + else: + return None + + @property + def is_allocation(self): + return self.bytes > 0 + + @property + def op_name_or_unknown(self): + return self.op_name if self.op_name else '' + + @classmethod + def from_event(cls, event: MemoryEvent): + return cls(event.scope, event.pid, event.tid, event.ts, event.device_type, event.device_id, + event.addr, event.bytes, event.total_allocated, event.total_reserved) + + def __repr__(self) -> str: + return f"<{'+' if self.bytes>0 else ''}{self.bytes}B, addr: {self.addr}, ts: {self.ts}>" + + +class MemorySnapshot: + def __init__(self, memory_records: Iterable[MemoryRecord], + op_memory_table: Dict[OperatorNode, List[MemoryRecord]], + processed_nodes: Dict[OperatorNode, int]) -> None: + self.memory_records = memory_records + self.op_memory_table = op_memory_table + # the visited node times from parent to child + # troubleshooting issue purpose. + self.processed_node = processed_nodes + self.unreached_node = defaultdict(list) + + def get_peak_memory(self) -> Dict[Tuple[DeviceType, int], int]: + peaks = defaultdict(int) + for r in self.memory_records: + if r.total_allocated == r.total_allocated: # !isnan + peaks[(r.device_type, r.device_id)] = max(peaks[(r.device_type, r.device_id)], r.total_allocated) + return peaks + + def get_memory_statistics(self, + tid2tree: Dict[int, OperatorNode], + start_ts=None, end_ts=None) -> Dict[str, Dict[str, List[int]]]: + metric_length = len(MemoryMetrics) + self_metric_length = metric_length // 2 + + def dict_factory(): + return defaultdict(lambda: [0] * metric_length) + + # traverse outputs + op_list: List[OperatorNode] = [] + # two level keys dictionary + # first keyed by node, then keyed by device (CPU/GPU0/GPU1/etc.) 
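+ # An illustrative entry (assuming a single GPU):
+ # memory_metrics_keyed_by_node[node]['GPU0'] ==
+ # [SelfIncreaseSize, SelfAllocationSize, SelfAllocationCount, IncreaseSize, AllocationSize, AllocationCount]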
+ memory_metrics_keyed_by_node: Dict[OperatorNode, Dict[str, List[int]]] = defaultdict(dict_factory)
+
+ def traverse_node_memory(node: OperatorNode):
+ if start_ts is not None and node.end_time < start_ts:
+ return
+ if end_ts is not None and node.start_time > end_ts:
+ return
+
+ is_op = is_operator_node(node)
+ if is_op:
+ op_list.append(node)
+
+ if node not in self.processed_node:
+ self.unreached_node[tid].append(node)
+ # since the node was never visited when inserting memory records, ignore all of its children
+ return
+ elif is_op:
+ node_memory_metrics = self.get_memory_metrics(node, start_ts, end_ts)
+ for device, metrics in node_memory_metrics.items():
+ # device is the name of the device, like CPU/GPU0
+ # metrics is an array [SelfIncreaseSize, SelfAllocationSize, SelfAllocationCount]
+ for i, value in enumerate(metrics):
+ memory_metrics_keyed_by_node[node][device][i] = value
+ memory_metrics_keyed_by_node[node][device][i + self_metric_length] += value
+ else:
+ logger.debug('node {}:{} is not an operator node; skip its self metrics processing'.format(
+ node.name, node.start_time))
+
+ # recurse into the children nodes
+ for child in node.children:
+ traverse_node_memory(child)
+ # sum up the child metrics
+ for device, metrics in memory_metrics_keyed_by_node[child].items():
+ for i in range(self_metric_length, metric_length):
+ memory_metrics_keyed_by_node[node][device][i] += metrics[i]
+
+ for tid, root in tid2tree.items():
+ for child in root.children:
+ traverse_node_memory(child)
+
+ # keyed first by device name like CPU/GPU0 etc., then keyed by operator name.
+ # the value is an array of items indexed by MemoryMetrics
+ memory_metrics_keyed_by_nodename: Dict[str, Dict[str, List[int]]] = defaultdict(dict_factory)
+ # node: the instance, device_keyed_metrics: dictionary keyed by device name like CPU/GPU0
+ for node, device_keyed_metrics in memory_metrics_keyed_by_node.items():
+ if not is_operator_node(node):
+ # skip nodes like Optimizer.step, DataLoader, ProfilerStep#1, etc.
+ continue
+
+ for device, metrics in device_keyed_metrics.items():
+ for i, metric in enumerate(metrics):
+ memory_metrics_keyed_by_nodename[device][node.name][i] += metric
+
+ # get the op_calls dictionary from module parser result.
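+ # aggregate_ops returns one dict per key function; with the single
+ # name-based key below, agg_result[0] maps operator name -> OperatorAgg,
+ # whose .calls field counts the operator's occurrences.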
+ op_calls: Dict[str, int] = defaultdict(int) + agg_result = aggregate_ops(op_list, [lambda op: op.name]) + for op_name, op_agg in agg_result[0].items(): + op_calls[op_name] += op_agg.calls + + result: Dict[str, Dict[str, List[int]]] = defaultdict(defaultdict) + for device, node_metrics in memory_metrics_keyed_by_nodename.items(): + for node, values in node_metrics.items(): + if any(values): + result[device][node] = values + [op_calls[node]] + + return result + + def get_memory_metrics(self, op: OperatorNode, start_ts, end_ts): + metrics_count = len([e.name for e in MemoryMetrics if e.name.startswith('Self')]) + memory_metrics: Dict[str, List[int]] = defaultdict(lambda: [0] * metrics_count) + for record in self.op_memory_table[op]: + if start_ts is not None and record.ts < start_ts: + continue + if end_ts is not None and record.ts > end_ts: + continue + name = record.device_name + if name is None: + continue + + memory_metrics[name][MemoryMetrics.SelfIncreaseSize] += record.bytes + if record.bytes > 0: + memory_metrics[name][MemoryMetrics.SelfAllocationSize] += record.bytes + memory_metrics[name][MemoryMetrics.SelfAllocationCount] += 1 + + return memory_metrics + + +class MemoryParser: + def __init__(self, memory_events: Iterable[MemoryEvent]): + # statistics purpose + self.staled_records: List[MemoryRecord] = [] + self.processed_records: List[MemoryRecord] = [] + self.memory_records: List[MemoryRecord] = [MemoryRecord.from_event(e) for e in memory_events] + + def find_memory_nodes(self, tid2tree: Dict[int, OperatorNode]) -> MemorySnapshot: + records_by_tid: Dict[int, List[MemoryRecord]] = defaultdict(list) + for r in self.memory_records: + records_by_tid[r.tid].append(r) + + op_memory_table: Dict[OperatorNode, List[MemoryRecord]] = defaultdict(list) + processed_node = defaultdict(int) + + tree_height = 0 + for tid, records in records_by_tid.items(): + if not records: + continue + + # each item is (parent_node, child_index) that it is visiting. + node_stack: List[Tuple[OperatorNode, int]] = [] + + record_index = 0 + current_node: OperatorNode = tid2tree.get(tid) + child_index = 0 + + if current_node: + processed_node[current_node] += 1 + + while record_index < len(records): + """In the loop, one pass will process one record. The basic logic is: + It will search from the node that last visited since both the records and tree is ordered already + 1. it current node contains the records, then find the exactly child which just embrace it. + 2. otherwise, find the parent node and set the child_index, so that the parent node could continue from previous visited node. # noqa: E501 + 3. if there is not any node contains the records, then all remaining records will be ignored. + """ + record = records[record_index] + + if len(node_stack) > tree_height: + tree_height = len(node_stack) + + if current_node is None or current_node.start_time is None or current_node.end_time is None: + # 3. Ignore all remaining records. + logger.debug( + 'could not find the node for tid %d, timestamp: %d, record index: %d, total records: %d' % ( + record.tid, record.ts, record_index, len(records))) + self.staled_records.append(records[record_index]) + record_index += 1 + continue + + if record.ts < current_node.start_time: + # this should only happens for root node. + logger.debug('record timestamp %d is less that the start time of %s' % + (record.ts, current_node.name)) + # This record has no chance to be appended to following tree node. 
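+ # (Records and each node's children are both sorted by timestamp, so a
+ # record that falls before the current position can be dropped safely.)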
+ self.staled_records.append(record) + record_index += 1 + continue + elif record.ts >= current_node.end_time: + # 2. pop parent node and update the child_index accordingly. + if len(node_stack) > 0: + current_node, child_index = node_stack.pop() + child_index += 1 + else: + # if there is not item in stack, set it to None + current_node = None + continue + + # 1. find the real node embrace the record. + # Find the node which contains the records from top to downmost. + while child_index < len(current_node.children): + if record.ts < current_node.children[child_index].start_time: + # if current record timestamp is less than the current child's startime, + # we will break the search and keep the child_index not change. So that next time + # we can continue from here. + # there is no any child contains the record.timestamp + # child_find is False at this case. + break + elif record.ts >= current_node.children[child_index].end_time: + # if the record timestamp is greater than the children end time, increment to next child + # until find one contains the record + child_index += 1 + else: + # current children contains the record + processed_node[current_node.children[child_index]] += 1 + + # push child index which will be visited, then continue the loop + node_stack.append((current_node, child_index)) + current_node = current_node.children[child_index] + child_index = 0 + + # the current_node is the one contains the record at this moment. + if is_operator_node(current_node): + op_memory_table[current_node].append(record) + # NOTE: only allocation record can be associated with op. Because deallocation happens at the end + # of a tensor's lifetime which is not deterministic. + if record.is_allocation: + record.op_name = current_node.name + if len(node_stack) > 0: + record.parent_op_name = node_stack[-1][0].name + self.processed_records.append(record) + else: + self.staled_records.append(record) + + # the record is processed + record_index += 1 + + # show summary information + if len(self.staled_records) > 0 and len(self.memory_records) > 0: + logger.debug('{} memory records are skipped in total {} memory records and only {} get processed'.format( + len(self.staled_records), len(self.memory_records), len(self.processed_records))) + if tree_height > 0: + logger.debug('max tree height is {}'.format(tree_height)) + + all_records = self.get_preprocessed_records() + return MemorySnapshot(all_records, op_memory_table, processed_node) + + def get_preprocessed_records(self): + memory_records = sorted(self.memory_records, key=lambda r: r.ts) + + alloc = {} # allocation events may or may not have paired free event + prev_ts = float('-inf') # ensure ordered memory records is ordered + for i, r in enumerate(memory_records): + if r.addr is None: + # profile json data prior to pytorch 1.10 do not have addr + # we should ignore them + continue + assert prev_ts <= r.ts + prev_ts = r.ts + addr = r.addr + size = r.bytes + if size > 0: + # Allocation event, to be matched with a Release event + alloc[addr] = i + else: + # Processing a Release event + if addr in alloc: + alloc_r = memory_records[alloc[addr]] + r.op_name = alloc_r.op_name + r.parent_op_name = alloc_r.parent_op_name + del alloc[addr] + return memory_records diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/module_op.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/module_op.py new file mode 100644 index 0000000000000000000000000000000000000000..68e74d578116b9deff4aef4efc36933c52250247 --- /dev/null +++ 
b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/module_op.py @@ -0,0 +1,269 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# ------------------------------------------------------------------------- +from collections import namedtuple +from typing import Dict, Generator, Iterable, List, Optional, Set, Tuple, Union + +from .node import (DataLoaderNode, ModuleNode, OperatorNode, OptimizerNode, + PLModuleNode, ProfilerStepNode, is_operator_node) +from .trace import BaseEvent, EventTypes, PLModuleEvent, PythonFunctionEvent + + +class Module: + def __init__(self, name: str, module_id: int, shape: str = ''): + self.name = name + self.module_id = module_id + self.children: List[Module] = [] + + def __hash__(self): + return hash((self.name, self.module_id, tuple(self.children))) + + def __eq__(self, o) -> bool: + if not isinstance(o, Module): + return False + + return (self.name == o.name and + self.module_id == o.module_id and + self.children == o.children) + + +class ModuleStats: + def __init__(self, name: str, module_id: int): + self.name = name + self.module_id = module_id + self.occurences: int = 0 + self.operators: int = 0 + self.host_duration: int = 0 + self.device_duration: int = 0 + self.self_host_duration: int = 0 + self.self_device_duration: int = 0 + + @property + def avg_host_duration(self): + return self.host_duration / self.occurences + + @property + def avg_device_duration(self): + return self.device_duration / self.occurences + + +Stats = namedtuple('Stats', [ + 'name', + 'id', + 'occurences', + 'operators', + 'host_duration', + 'self_host_duration', + 'device_duration', + 'self_device_duration', + 'avg_duration', + 'children']) + + +def aggegate_module_view(tid2tree: Dict[int, OperatorNode], events: List[BaseEvent]) -> Optional[List[Stats]]: + roots = _build_module_hierarchy(events) + modules = _get_node_list(tid2tree, ModuleNode) + if modules and roots: + return _process_module_statistics(modules, roots) + else: + return None + + +def aggegate_pl_module_view(tid2tree: Dict[int, OperatorNode], events: List[BaseEvent]) -> Optional[List[Stats]]: + roots = _build_module_hierarchy_from_name(events) + modules = _get_node_list(tid2tree, PLModuleNode) + if modules and roots: + return _process_module_statistics(modules, roots) + else: + return None + + +def _build_module_hierarchy_from_name(events: List[PLModuleEvent]) -> List[Module]: + pl_module_events = [e for e in events if e.type == EventTypes.PL_MODULE] + name2module: Dict[str, Module] = {} + no_root: Set[str] = set() + + for event in pl_module_events: + if event.name not in name2module: + name2module[event.name] = Module(event.name, 0) + + for name, module in name2module.items(): + if name.find('.') == -1: + continue + parent_name = name[:name.rfind('.')] + if parent_name in name2module: + name2module[parent_name].children.append(module) + no_root.add(module.name) + + return [module for name, module in name2module.items() if name not in no_root] + + +def _build_module_hierarchy(events: List[PythonFunctionEvent]) -> List[Module]: + """Get the module hierarchy from the chome trace events + """ + python_events = [e for e in events if e.type in (EventTypes.PYTHON_FUNCTION, EventTypes.MODULE)] + id_to_event = {e.python_id: e for e in python_events} + + # Extract Python function topology. 
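+ # An illustrative shape: for a root event A(python_id=1, python_parent_id=0)
+ # with child B(python_id=2, python_parent_id=1),
+ # children == {0: [1], 1: [2], 2: []} and B is the only leaf.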
+ children: Dict[int, List[int]] = {} + for e in python_events: + e_id = e.python_id + children.setdefault(e_id, []) + e_parent_id = e.python_parent_id + children.setdefault(e_parent_id, []) + children[e_parent_id].append(e_id) + function_leaves = [k for k, v in children.items() if not v] + + # Convert Python function topology to Module topology. + # This is a simple O(n) tree walking algorithm where we start from the leaves + # and walk up, discarding any nodes which are not Module nodes. + module_parent_map = {} + seen = set() + for i in function_leaves: + e = id_to_event[i] + current_module = None + while e is not None: + e_id = e.python_id + if e.type == EventTypes.MODULE: + if current_module is not None: + module_parent_map[current_module.python_id] = e_id + current_module = e + module_parent_map.setdefault(e_id, None) + + seen_key = (e_id, id(current_module)) + if seen_key in seen: + break + seen.add(seen_key) + + e = id_to_event.get(e.python_parent_id, None) + + module_roots = [k for k, v in module_parent_map.items() if v is None] + module_child_map: Dict[int, List[int]] = {} + for child_id, parent_id in module_parent_map.items(): + module_child_map.setdefault(child_id, []) + module_child_map.setdefault(parent_id, []) + module_child_map[parent_id].append(child_id) + + # The traverse order is well defined which guarantees that a given topology + # will produce a unique and unambiguous hierarchy. + def append_hierarchy(e_id) -> Module: + e = id_to_event[e_id] + module = Module(e.name, e.module_id) + for id in module_child_map[e_id]: + child = append_hierarchy(id) + module.children.append(child) + return module + + unique_modules: Set[Module] = set() + for e_id in module_roots: + root = append_hierarchy(e_id) + unique_modules.add(root) + + return list(unique_modules) + + +def _aggregate_modules(modules: Iterable[Union[ModuleNode, PLModuleNode]]) -> Dict[Tuple[str, int], ModuleStats]: + """Aggregate the modules based on the name and module_id""" + module_aggs: Dict[Tuple(str, int), ModuleStats] = {} + for m in modules: + key = (m.name, m.module_id) + if key not in module_aggs: + module_aggs[key] = ModuleStats(m.name, m.module_id) + agg = module_aggs[key] + agg.occurences += 1 + + agg.operators += sum(is_operator_node(child) for child in m.children) + + agg.self_host_duration += m.self_host_duration + agg.host_duration += m.end_time - m.start_time + + agg.self_device_duration += m.self_device_duration + agg.device_duration += m.device_duration + + return module_aggs + + +def _get_node_list(tid2tree: Dict[int, OperatorNode], node_class) -> Generator[OperatorNode, None, None]: + """Get all node with node_class from the operator tree""" + def traverse_node(node): + # Check OptimizerNode here because in PytorchLightning PLModuleNode is under OptimizerNoder. 
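+ # type(...) is checked against an explicit whitelist (instead of isinstance)
+ # so the walk only descends through these known container node kinds.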
+ if type(node) not in (ProfilerStepNode, ModuleNode, OperatorNode, OptimizerNode, PLModuleNode, DataLoaderNode): + return + + if isinstance(node, node_class): + yield node + + for child in node.children: + yield from traverse_node(child) + + for _, root in tid2tree.items(): + for child in root.children: + yield from traverse_node(child) + + +def _process_module_statistics( + modules_nodes: Iterable[Union[ModuleNode, PLModuleNode]], + hierarchy: Iterable[Module]) -> List[Stats]: + """Get the module statistics from the ModuleNode(s) and the hierarchy + """ + module_aggs = _aggregate_modules(modules_nodes) + + def process_modules(h_modules: Iterable[Module]): + for m in h_modules: + name = m.name.replace('nn.Module: ', '') + stats = module_aggs[(m.name, m.module_id)] + + child_stats = list(process_modules(m.children)) + yield Stats( + name, + m.module_id, + stats.occurences, + stats.operators, + stats.host_duration, + stats.self_host_duration, + stats.device_duration, + stats.self_device_duration, + stats.avg_device_duration if stats.avg_device_duration > 0 else stats.avg_host_duration, + child_stats) + + data = sorted(process_modules(hierarchy), key=lambda x: x.name) + return data + + +def get_module_tree(tid2tree: Dict[int, OperatorNode]): + """Get the module tree in timeline""" + from copy import copy + + modules = [] + + def traverse_node(node, parent: Optional[ModuleNode]): + if type(node) not in (ProfilerStepNode, ModuleNode): + return + + if isinstance(node, ModuleNode): + module = copy(node) + # remove the children after copy to keep the module only + module.children = [] + + if parent is None: + modules.append(module) + else: + parent.children.append(module) + parent = module + + for child in node.children: + traverse_node(child, parent) + + for _, root in tid2tree.items(): + for child in root.children: + # since the root node is CallTreeRoot, there is no parent ModuleNode + traverse_node(child, None) + + return modules + + +def dump_modules(level: int, modules: Iterable[Union[Module, ModuleNode]]): + """testing purpose""" + for module in modules: + print(f"{' ' * level}{module.name.replace('nn.Module: ', '')}_{module.module_id}") + dump_modules(level + 1, module.children) diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/node.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/node.py new file mode 100644 index 0000000000000000000000000000000000000000..02718839874f66505d5c067ea6bfc8193c9798f9 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/node.py @@ -0,0 +1,316 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# ------------------------------------------------------------------------- +import sys +from abc import ABC +from typing import List, Optional, Tuple + +from .. import utils +from .tensor_core import TC_Allowlist, TC_OP_Allowlist +from .trace import (DurationEvent, EventTypes, KernelEvent, ModuleEvent, + OperatorEvent, PLProfileEvent) + +logger = utils.get_logger() + +ExcludeOpName = ['DataParallel.forward', 'DistributedDataParallel.forward'] + + +class BaseNode(ABC): + def __init__(self, name: str, start_time: int, end_time: int, type: str, tid: int, + external_id: Optional[int] = None): + self.name = name + self.start_time = start_time + self.end_time = end_time + self.type = type + self.tid = tid + self.external_id = external_id # For consistency check. 
+ + @staticmethod + def get_node_argument(event: DurationEvent): + kwargs = {} + kwargs['name'] = event.name + kwargs['start_time'] = event.ts + kwargs['end_time'] = event.ts + event.duration + kwargs['type'] = event.type + kwargs['tid'] = event.tid + + external_id = getattr(event, 'external_id', None) + if external_id is not None: + kwargs['external_id'] = external_id + + return kwargs + + @property + def duration(self) -> int: + if self.start_time is not None and self.end_time is not None: + return self.end_time - self.start_time + else: + return 0 + + +class CommunicationNode(BaseNode): + def __init__(self, input_shape: List[List[int]], input_type: List[str], **kwargs): + super().__init__(**kwargs) + self.input_shape = input_shape + self.input_type = input_type + self.kernel_ranges: List[Tuple[int, int]] = [] + self.real_time_ranges: List[Tuple[int, int]] = [] + self.total_time: int = 0 + self.real_time: int = 0 + self.step_name: str = None + + @classmethod + def create(cls, event: OperatorEvent): + kwargs = BaseNode.get_node_argument(event) + return cls(input_shape=event.input_shape, input_type=event.input_type, **kwargs) + + +class HostNode(BaseNode): + def __init__(self, device_duration: int = 0, **kwargs): + super().__init__(**kwargs) + self.device_duration = device_duration # Total time of Kernel, GPU Memcpy, GPU Memset. TODO: parallel multi-stream? # noqa: E501 + + +class OperatorNode(HostNode): + # Don't use [] as default parameters + # https://stackoverflow.com/questions/1132941/least-astonishment-and-the-mutable-default-argument?page=1&tab=votes#tab-top + # https://web.archive.org/web/20200221224620/http://effbot.org/zone/default-values.htm + def __init__(self, children=None, runtimes=None, input_shape: Optional[List[List[int]]] = None, + input_type: Optional[List[str]] = None, callstack: Optional[str] = None, + self_host_duration: int = 0, self_device_duration: int = 0, **kwargs): + super().__init__(**kwargs) + self.children: List[OperatorNode] = [] if children is None else children # OperatorNode and ProfilerStepNode. + self.runtimes: List[RuntimeNode] = [] if runtimes is None else runtimes # RuntimeNode + self.input_shape = input_shape + self.input_type = input_type + self.callstack = callstack + self.self_host_duration = self_host_duration + self.self_device_duration = self_device_duration + # self.parent_node = None + self.tc_eligible = self.name in TC_OP_Allowlist + self.tc_self_duration = 0 # Time of TC kernels launched by this op excluding its children operators. + self.tc_total_duration = 0 # Time of TC kernels launched by this op including its children operators. + + def fill_stats(self): + # TODO: Replace recursive by using a stack, in case of too deep callstack. + self.children.sort(key=lambda x: (x.start_time, -x.end_time)) + self.runtimes.sort(key=lambda x: (x.start_time, -x.end_time) + if x.start_time and x.end_time else (sys.maxsize, -sys.maxsize - 1)) + + for child in self.children: + child.fill_stats() + for rt in self.runtimes: + rt.fill_stats(self) + + self.self_host_duration = self.end_time - self.start_time + for child in self.children: + self.device_duration += child.device_duration + self.self_host_duration -= (child.end_time - child.start_time) + self.tc_total_duration += child.tc_total_duration + # Mark TC eligible as True if any child operator is TC eligible. 
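+ # e.g. an operator that is not itself in TC_OP_Allowlist is promoted to
+ # TC eligible here as soon as one of its child operators is eligible.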
+ if self.type == EventTypes.OPERATOR and not self.tc_eligible and child.tc_eligible: + self.tc_eligible = True + for rt in self.runtimes: + # From PyTorch 1.8 RC1, cpu_self_time does not include runtime's time. + # So here we keep consistent with it. + if rt.end_time is not None and rt.start_time is not None: + self.self_host_duration -= (rt.end_time - rt.start_time) + self.device_duration += rt.device_duration + self.self_device_duration += rt.device_duration + self.tc_self_duration += rt.tc_duration + self.tc_total_duration += rt.tc_duration + if self.type == EventTypes.OPERATOR and not self.tc_eligible and rt.tc_duration > 0: + logger.warning("New Tensor Cores eligible operator found: '{}'!".format(self.name)) + self.tc_eligible = True + + def get_operator_and_kernels(self): + ops: List[OperatorNode] = [] + kernels: List[DeviceNode] = [] + for child in self.children: + child_ops, child_kernels = child.get_operator_and_kernels() + ops.extend(child_ops) + kernels.extend(child_kernels) + for rt in self.runtimes: + kernels.extend(list(rt.get_kernels())) + + if is_operator_node(self): + ops.append(self) + + return ops, kernels + + @classmethod + def create(cls, event: OperatorEvent): + kwargs = BaseNode.get_node_argument(event) + return cls(input_shape=event.input_shape, input_type=event.input_type, callstack=event.callstack, **kwargs) + + +class ProfilerStepNode(OperatorNode): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class ModuleNode(OperatorNode): + def __init__(self, module_id: int, python_id: int, python_parent_id: int, **kwargs): + super().__init__(**kwargs) + self.module_id = module_id + self.python_id = python_id + self.python_parent_id = python_parent_id + + def fill_stats(self): + super().fill_stats() + self.self_device_duration += get_chilren_self_device_time(self) + + @classmethod + def create(cls, event: ModuleEvent): + kwargs = BaseNode.get_node_argument(event) + kwargs['module_id'] = event.module_id + kwargs['python_id'] = event.python_id + kwargs['python_parent_id'] = event.python_parent_id + # From the time being, the ModuleNode always have external_id to 0. + # As the result, we need reset the external_id to None to ignore adding the runtime nodes for ModuleNode + kwargs.pop('external_id', None) + return cls(**kwargs) + + +class BackwardNode(OperatorNode): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def fill_stats(self): + """Override the timestamps and duration for BackwardNode only + """ + self.children.sort(key=lambda x: (x.start_time, -x.end_time)) + self.start_time = self.children[0].start_time + self.end_time = self.children[-1].end_time + + self.self_host_duration = self.end_time - self.start_time + for child in self.children: + self.device_duration += child.device_duration + self.self_host_duration -= (child.end_time - child.start_time) + self.tc_total_duration += child.tc_total_duration + # Mark TC eligible as True if any child operator is TC eligible. 
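+ # (Unlike OperatorNode.fill_stats, no runtimes are processed here: the
+ # synthetic BackwardNode carries no runtime events of its own.)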
+ if not self.tc_eligible and child.tc_eligible: + self.tc_eligible = True + + +class PLProfileNode(OperatorNode): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @classmethod + def create(cls, event: PLProfileEvent): + kwargs = BaseNode.get_node_argument(event) + return cls(**kwargs) + + +class PLModuleNode(OperatorNode): + def __init__(self, module_id: int, **kwargs): + super().__init__(**kwargs) + self.module_id = module_id + + def fill_stats(self): + super().fill_stats() + self.self_device_duration += get_chilren_self_device_time(self) + + @classmethod + def create(cls, event: PLProfileEvent): + kwargs = BaseNode.get_node_argument(event) + kwargs['module_id'] = event.module_id + return cls(**kwargs) + + +class DataLoaderNode(OperatorNode): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class OptimizerNode(OperatorNode): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class RuntimeNode(HostNode): + def __init__(self, device_nodes: Optional[List['DeviceNode']] = None, **kwargs): + super().__init__(**kwargs) + # One runtime could trigger more than one kernel, such as cudaLaunchCooperativeKernelMultiDevice. + self.device_nodes = sorted(device_nodes, key=lambda x: (x.start_time, -x.end_time)) if device_nodes else None + self.tc_duration: int = 0 # Time summarization of all its launched kernels. + + def fill_stats(self, op_node: OperatorNode = None): + if self.device_nodes: + for device_node in self.device_nodes: + if op_node: + device_node.op_name = op_node.name + device_node.op_tc_eligible = op_node.tc_eligible + device_duration = device_node.end_time - device_node.start_time + self.device_duration += device_duration + self.tc_duration += device_duration if device_node.tc_used else 0 + + def get_kernels(self): + if self.device_nodes: + for d in self.device_nodes: + if d.type == EventTypes.KERNEL: + yield d + + @classmethod + def create(cls, event, device_nodes: Optional[List['DeviceNode']]): + kwargs = BaseNode.get_node_argument(event) + return cls(device_nodes=device_nodes, **kwargs) + + +class DeviceNode(BaseNode): + def __init__(self, + blocks_per_sm: Optional[float] = None, + occupancy: int = None, + grid: Optional[List[int]] = None, + block: Optional[List[int]] = None, + regs_per_thread: int = None, + shared_memory: int = None, + device_id: int = None, **kwargs): + super().__init__(**kwargs) + self.op_tc_eligible = False + self.op_name = None + self.blocks_per_sm = blocks_per_sm + self.occupancy = occupancy + self.grid = grid + self.block = block + self.regs_per_thread = regs_per_thread + self.shared_memory = shared_memory + self.tc_used = self.name in TC_Allowlist + self.device_id = device_id + + @classmethod + def create(cls, event: KernelEvent): + kwargs = BaseNode.get_node_argument(event) + if event.type == EventTypes.KERNEL: + kwargs['blocks_per_sm'] = event.blocks_per_sm + kwargs['occupancy'] = event.occupancy + kwargs['grid'] = event.grid + kwargs['block'] = event.block + kwargs['regs_per_thread'] = event.regs_per_thread + kwargs['shared_memory'] = event.shared_memory + kwargs['device_id'] = event.device_id + return cls(**kwargs) + + +def create_operator_node(event: OperatorEvent): + if (event.name.startswith('enumerate(DataLoader)#') and event.name.endswith('.__next__') + or event.name.startswith('enumerate(DataPipe)#')): + return DataLoaderNode.create(event) + elif event.name.startswith('Optimizer.step'): + return OptimizerNode.create(event) + else: + return OperatorNode.create(event) + + +def is_operator_node(node: 
BaseNode): + return bool(type(node) is OperatorNode and node.type == EventTypes.OPERATOR and node.name not in ExcludeOpName + and not node.name.startswith("Optimizer.")) # exclude Optimizer.zero_grad + + +def get_chilren_self_device_time(node): + self_device_duration = 0 + for child in node.children: + if is_operator_node(child): + self_device_duration += child.device_duration + return self_device_duration diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/op_agg.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/op_agg.py new file mode 100644 index 0000000000000000000000000000000000000000..8a1af502f5c91147b1ef4e764e84385d7cc11af7 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/op_agg.py @@ -0,0 +1,162 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# -------------------------------------------------------------------------- +import sys +from collections import defaultdict +from typing import Callable, Dict, List + +from .. import utils +from .node import DeviceNode, OperatorNode + +logger = utils.get_logger() + + +class OperatorAgg: + def __init__(self, op: OperatorNode): + self.name = op.name + self.input_shape = str(op.input_shape) # Optional + + self.callstacks = set() # Optional + self.calls: int = 0 + self.host_duration: int = 0 + self.device_duration: int = 0 + self.self_host_duration: int = 0 + self.self_device_duration: int = 0 + self.tc_eligible = op.tc_eligible + self.tc_self_duration: int = 0 + self.tc_total_duration: int = 0 + # TODO: Think about adding these avgs to UI. + + @property + def tc_self_ratio(self) -> float: + return self.tc_self_duration / self.self_device_duration if self.self_device_duration > 0 else 0 + + @property + def tc_total_ratio(self) -> float: + return self.tc_total_duration / self.device_duration if self.device_duration > 0 else 0 + + +def aggregate_ops(op_list: List[OperatorNode], + keys_func: List[Callable[[OperatorNode], str]]) -> List[Dict[str, OperatorAgg]]: + def aggregate(key_to_agg: Dict[str, OperatorAgg], key: str, op: OperatorNode): + if key not in key_to_agg: + key_to_agg[key] = OperatorAgg(op) + agg = key_to_agg[key] + agg.callstacks.add(op.callstack) + agg.calls += 1 + agg.host_duration += op.duration + agg.device_duration += op.device_duration + agg.self_host_duration += op.self_host_duration + agg.self_device_duration += op.self_device_duration + agg.tc_self_duration += op.tc_self_duration + agg.tc_total_duration += op.tc_total_duration + return agg + + agg_dicts: List[Dict[str, OperatorAgg]] = [{} for _ in range(len(keys_func))] + for op in op_list: + for i, key_func in enumerate(keys_func): + key = key_func(op) + aggregate(agg_dicts[i], key, op) + + return agg_dicts + + +class KernelAggByNameOp: + def __init__(self, kernel: DeviceNode, op_name: str): + self.name = kernel.name + self.op_name = op_name + self.grid = kernel.grid + self.block = kernel.block + self.regs_per_thread = kernel.regs_per_thread + self.shared_memory = kernel.shared_memory + + self.calls: int = 0 + self.total_duration: int = 0 + self.min_duration: int = sys.maxsize + self.max_duration: int = 0 + self.blocks_per_sm = 0.0 + self.occupancy = 0.0 + self.tc_used = kernel.tc_used + self.op_tc_eligible = kernel.op_tc_eligible + + @property + def avg_duration(self): + return self.total_duration / self.calls + + @property + def avg_blocks_per_sm(self) -> float: + return self.blocks_per_sm / self.total_duration if 
self.total_duration > 0 else 0 + + @property + def avg_occupancy(self) -> float: + return self.occupancy / self.total_duration if self.total_duration > 0 else 0 + + +def aggregate_kernels(kernel_list: List[DeviceNode]) -> List[KernelAggByNameOp]: + name_op_to_agg: Dict[str, KernelAggByNameOp] = {} + for kernel in kernel_list: + dur = kernel.end_time - kernel.start_time + op_name = 'N/A' if kernel.op_name is None else kernel.op_name + key = '###'.join((kernel.name, op_name, + str(kernel.grid), str(kernel.block), + str(kernel.regs_per_thread or '0'), str(kernel.shared_memory or '0'))) + if key not in name_op_to_agg: + name_op_to_agg[key] = KernelAggByNameOp(kernel, op_name) + agg = name_op_to_agg[key] + agg.calls += 1 + agg.total_duration += dur + agg.min_duration = min(agg.min_duration, dur) + agg.max_duration = max(agg.max_duration, dur) + agg.blocks_per_sm += float(kernel.blocks_per_sm or 0) * dur + agg.occupancy += float(kernel.occupancy or 0) * dur + + kernel_list_groupby_name_op = list(name_op_to_agg.values()) + return kernel_list_groupby_name_op + + +class ModuleAggregator: + + def __init__(self): + self.op_list_groupby_name: List[OperatorAgg] = None # For Operator-view. + self.op_list_groupby_name_input: List[OperatorAgg] = None # For Operator-view. + self.kernel_list_groupby_name_op: List[KernelAggByNameOp] = None # For Kernel-view. + self.stack_lists_group_by_name: Dict[str, List[OperatorAgg]] = None + self.stack_lists_group_by_name_input: Dict[str, List[OperatorAgg]] = None + self.ops: List[OperatorNode] = None + + def aggregate(self, tid2tree: Dict[int, OperatorNode]): + # get the operators and kernels recursively by traverse the node tree root. + ops: List[OperatorNode] = [] + kernels: List[DeviceNode] = [] + for root in tid2tree.values(): + root_ops, root_kernels = root.get_operator_and_kernels() + ops.extend(root_ops) + kernels.extend(root_kernels) + + # aggregate both kernels and operators + self.kernel_list_groupby_name_op = aggregate_kernels(kernels) + + keys: List[Callable[[OperatorNode], str]] = [ + lambda x: x.name, + lambda x: '###'.join((x.name, str(x.input_shape))), + lambda x: '###'.join((x.name, str(x.callstack))), + lambda x: '###'.join((x.name, str(x.input_shape), str(x.callstack))) + ] + agg_result = aggregate_ops(ops, keys) + stack_lists_group_by_name: Dict[str, List[OperatorAgg]] = defaultdict(list) + stack_lists_group_by_name_input: Dict[str, List[OperatorAgg]] = defaultdict(list) + for agg in agg_result[2].values(): + assert (len(agg.callstacks) == 1) + if list(agg.callstacks)[0]: + stack_lists_group_by_name[agg.name].append(agg) + for agg in agg_result[3].values(): + assert (len(agg.callstacks) == 1) + if list(agg.callstacks)[0]: + key = agg.name + '###' + str(agg.input_shape) + stack_lists_group_by_name_input[key].append(agg) + + self.op_list_groupby_name = list(agg_result[0].values()) + self.op_list_groupby_name_input = list(agg_result[1].values()) + self.stack_lists_group_by_name = stack_lists_group_by_name + self.stack_lists_group_by_name_input = stack_lists_group_by_name_input + self.ops = ops diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/op_tree.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/op_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..bee6687e8bd06bdcc936e5273dc81a240a586806 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/op_tree.py @@ -0,0 +1,351 @@ +# ------------------------------------------------------------------------- +# Copyright (c) 
Microsoft Corporation. All rights reserved.
+# -------------------------------------------------------------------------
+import sys
+from collections import defaultdict
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from .. import utils
+from .node import (BackwardNode, DeviceNode, ModuleNode, OperatorNode,
+ ProfilerStepNode, RuntimeNode, is_operator_node)
+from .trace import EventTypes
+
+logger = utils.get_logger()
+
+
+class OpTreeBuilder:
+ BACKWARD_ROOT_PREFIX = 'autograd::engine::evaluate_function:'
+ BACKWARD_ACCUMULATE_GRAD = 'autograd::engine::evaluate_function: torch::autograd::AccumulateGrad'
+
+ def __init__(self):
+ self.main_tid: int = None
+ self.tid2tree: Dict[int, OperatorNode] = None
+
+ def build_tree(self,
+ tid2list: Dict[int, List[OperatorNode]],
+ tid2zero_rt_list: Dict[int, List[RuntimeNode]],
+ staled_device_nodes: List[DeviceNode],
+ fwd_bwd_map: Dict[int, int]):
+ """Construct BackwardNodes and replace the original backward nodes.
+ """
+ self.tid2tree = self._build_tree(tid2list, tid2zero_rt_list, staled_device_nodes)
+
+ # if no forward/backward association could be found, skip the processing
+ if not fwd_bwd_map:
+ logger.debug('there is no forward/backward association; skip processing backward correlation.')
+ return self.tid2tree
+
+ self._set_main_tid()
+
+ modules, backward_nodes = self._get_modules()
+ if not modules or not backward_nodes:
+ return self.tid2tree
+
+ _, ts2parent = OpTreeBuilder._get_node_parents(backward_nodes)
+ agg_nodes = OpTreeBuilder._group_backward_nodes(backward_nodes)
+ fwd_bwd_root = self._get_backward_roots(fwd_bwd_map, ts2parent, agg_nodes)
+ if len(agg_nodes) > 0:
+ logger.warning('some backward nodes cannot be matched to forward nodes')
+
+ backward_modules: List[BackwardNode] = []
+ for module in modules:
+ OpTreeBuilder._build_backward_module(module, None, fwd_bwd_root, backward_modules)
+ OpTreeBuilder._insert_backward_modules(self.tid2tree[self.main_tid], backward_modules)
+ self.tid2tree = {tid: root for tid, root in self.tid2tree.items() if len(root.children) > 0}
+
+ return self.tid2tree
+
+ def _build_tree(self, tid2list: Dict[int, List[OperatorNode]], tid2zero_rt_list, staled_device_nodes):
+ tid2tree = {}
+
+ for tid, op_list in tid2list.items():
+ zero_rt_list = tid2zero_rt_list[tid] if tid in tid2zero_rt_list else []
+ # Note that when two start_time values are equal, the one with the bigger end_time should come first.
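+ # e.g. for two ops both starting at ts=100 and ending at 150 and 120, the
+ # sort keys (100, -150) < (100, -120) put the longer, enclosing op first.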
+ op_list.sort(key=lambda x: (x.start_time, -x.end_time)) + main_tid = any([op.name.startswith('ProfilerStep#') for op in op_list]) + if main_tid: + # only append the staled device nodes into main thread + self.main_tid = op_list[0].tid + root_node = self._build_tree_internal(op_list, zero_rt_list, tid, staled_device_nodes) + else: + root_node = self._build_tree_internal(op_list, zero_rt_list, tid, []) + tid2tree[int(tid)] = root_node + + return tid2tree + + def _set_main_tid(self): + if self.main_tid is None and self.tid2tree: + if len(self.tid2tree) == 1: + self.main_tid = next(iter(self.tid2tree)) + else: + # there are multiple tids + backward_tid = self._find_backward_tid() + tid2len = { + tid: root.end_time - root.start_time for tid, root in self.tid2tree.items() + if tid != backward_tid or backward_tid is None + } + # get the maximum length as the main thread + self.main_tid = max(tid2len, key=tid2len.get) + + def _find_backward_tid(self): + for root in self.tid2tree.values(): + for child in root.children: + if child.name.startswith(OpTreeBuilder.BACKWARD_ROOT_PREFIX): + return child.tid + + return None + + def _build_tree_internal(self, host_node_list, zero_rt_list, tid, staled_device_nodes): + """host_node_list: list of OperatorNode and ProfilerStepNode. + zero_rt_list: list of RuntimeNode with external_id=0.""" + + def build_tree_relationship(host_node_list: Iterable[OperatorNode], zero_rt_list, staled_device_nodes): + dummpy_rt: List[RuntimeNode] = [] + if staled_device_nodes: + # Note: Although kernels of this dummy runtime is put under main thread's tree, + # we don't know which thread launches them. + # TODO: Don't make belonging thread assumption on future usage if we need special handling + dummpy_rt.append(RuntimeNode( + name='dummy', + start_time=None, + end_time=None, + type=EventTypes.RUNTIME, + tid=0, + device_nodes=staled_device_nodes)) + dummpy_rt[0].fill_stats() + node_stack: List[OperatorNode] = [] + root_node = OperatorNode( + name='CallTreeRoot', + start_time=-sys.maxsize - 1, + end_time=sys.maxsize, + type=EventTypes.PYTHON, + tid=tid, + runtimes=zero_rt_list + dummpy_rt) # Give the list of RuntimeNode with external_id=0 to root node. + node_stack.append(root_node) + for node in host_node_list: + while True: # break loop when the node is inserted. + tail_node = node_stack[-1] + if node.start_time < tail_node.end_time: + if node.end_time <= tail_node.end_time: + tail_node.children.append(node) + # node.parent_node = weakref.ref(tail_node) + node_stack.append(node) + else: + logger.error('Error in input data: ranges on the same thread should not intersect!' + 'Father:({},{},{}) Child:({},{},{})' + .format(tail_node.name, tail_node.start_time, tail_node.end_time, + node.name, node.start_time, node.end_time)) + break + else: + node_stack.pop() + return root_node + + # Merge the consecutive calls to same function into one. + # Just follow the same pattern in torch/autograd/profiler.py, + # EventList._remove_dup_nodes + # TODO: Replace recursive by for loop, in case of too deep callstack. + def remove_dup_nodes(node: OperatorNode): + if node.type == EventTypes.RUNTIME: + return + if len(node.children) == 1: + child = node.children[0] + if node.name == child.name and node.type == EventTypes.OPERATOR and child.type == EventTypes.OPERATOR: + node.children = child.children + node.runtimes = child.runtimes # Keep consistent with autograd profiler. + remove_dup_nodes(node) # This node may have to merge with child's child. 
+ return + + for child in node.children: + remove_dup_nodes(child) + + root_node = build_tree_relationship(host_node_list, zero_rt_list, staled_device_nodes) + remove_dup_nodes(root_node) + root_node.fill_stats() + + # replace the root_node start_time/end_time + root_node.start_time = next((child.start_time for child in root_node.children + if child.start_time is not None), None) + root_node.end_time = next((child.end_time for child in reversed(root_node.children) + if child.end_time is not None), None) + return root_node + + def _get_modules(self) -> Tuple[List[ModuleNode], List[OperatorNode]]: + """Get the ModuleNodes and backward root nodes + If there are any ModuleNodes, the backward roots will be removed from the tree + so that later a new BackwardNode will be replaced. + """ + modules: List[ModuleNode] = [] + backward_nodes: Dict[OperatorNode, List[OperatorNode]] = defaultdict(list) + + def traverse_node(parent, node: OperatorNode): + if isinstance(node, ModuleNode): + modules.append(node) + elif isinstance(node, ProfilerStepNode): + for child in node.children: + traverse_node(node, child) + else: + if node.name.startswith(OpTreeBuilder.BACKWARD_ROOT_PREFIX): + backward_nodes[parent].append(node) + else: + pass + + for root in self.tid2tree.values(): + for child in root.children: + traverse_node(root, child) + + if modules: + backward_nodes_flatten: List[OperatorNode] = [] + # only remove the backward nodes when the module information exist + for p, nodes in backward_nodes.items(): + p.children = [child for child in p.children if child not in nodes] + backward_nodes_flatten.extend(nodes) + + return modules, backward_nodes_flatten + else: + return None, None + + @staticmethod + def _get_node_parents(nodes: Iterable[OperatorNode]): + """Get the child->parent relationship for these nodes""" + ts_to_node: Dict[int, OperatorNode] = {} + ts_to_parent: Dict[int, OperatorNode] = {} + + def traverse_node(node: OperatorNode): + if node.start_time not in ts_to_node: + ts_to_node[node.start_time] = node + for child in node.children: + if child.start_time not in ts_to_parent: + ts_to_parent[child.start_time] = node + traverse_node(child) + + for node in nodes: + traverse_node(node) + return ts_to_node, ts_to_parent + + @staticmethod + def _group_backward_nodes(nodes: Iterable[OperatorNode]) -> Dict[OperatorNode, List[OperatorNode]]: + """All nodes are backward nodes startswith autograd::engine::evaluate_function. + If one node's name is autograd::engine::evaluate_function: torch::autograd::AccumulateGrad, + it should be grouped with previous normal backward node. Otherwise, a new backward node should be started + """ + grouped_bwd_nodes: List[List[OperatorNode]] = [] + for node in nodes: + if node.name == OpTreeBuilder.BACKWARD_ACCUMULATE_GRAD: + grouped_bwd_nodes[-1].append(node) + else: + grouped_bwd_nodes.append([node]) + + # return the root backward node -> aggregated backward nodes array + # if there is no any AccumulateGrad accompanied with it, then the key:value is itself. 
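+ # An illustrative grouping: [evaluate_function: MulBackward0,
+ # evaluate_function: torch::autograd::AccumulateGrad] yields
+ # {MulBackward0 node: [MulBackward0 node, AccumulateGrad node]}.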
+        return {nodes[0]: nodes for nodes in grouped_bwd_nodes}
+
+    @staticmethod
+    def _get_backward_roots(fwd_bwd_map: Dict[int, int],
+                            ts2parent: Dict[int, OperatorNode],
+                            backward_nodes: Dict[OperatorNode, List[OperatorNode]]) -> Dict[int, List[OperatorNode]]:
+        if not fwd_bwd_map:
+            return None
+
+        fwd_to_bwdroot: Dict[int, List[OperatorNode]] = {}
+        for fwd, bwd in fwd_bwd_map.items():
+            parent = ts2parent.get(bwd)
+            while parent is not None and not parent.name.startswith(OpTreeBuilder.BACKWARD_ROOT_PREFIX):
+                parent = ts2parent.get(parent.start_time)
+
+            if parent:
+                fwd_to_bwdroot[fwd] = backward_nodes.pop(parent)
+            else:
+                logger.warning('parent is None for %s', bwd)
+
+        return fwd_to_bwdroot
+
+    @staticmethod
+    def _build_backward_module(node: ModuleNode,
+                               parent: Optional[BackwardNode],
+                               fwd_bwd_map: Dict[int, List[OperatorNode]],
+                               result: List[BackwardNode]):
+        """Construct the backward module tree from the root (node argument) and
+        append it to the result array when no parent is associated with it.
+        """
+        if not fwd_bwd_map:
+            logger.warning('The forward backward map is empty. The backward construction is skipped.')
+            return
+
+        if isinstance(node, ModuleNode):
+            backward_node = BackwardNode(name=node.name + '.backward', start_time=None, end_time=None,
+                                         type='backward', tid=0)
+            if parent is None:
+                result.append(backward_node)
+            else:
+                parent.children.append(backward_node)
+            parent = backward_node
+
+        for child in node.children:
+            if parent:
+                if is_operator_node(child):
+                    bwd_ops = fwd_bwd_map.get(child.start_time)
+                    if bwd_ops:
+                        parent.children.extend(bwd_ops)
+
+            OpTreeBuilder._build_backward_module(child, parent, fwd_bwd_map, result)
+
+        if isinstance(node, ModuleNode) and parent and parent.children:
+            parent.fill_stats()
+            parent.tid = parent.children[0].tid
+
+    @staticmethod
+    def _insert_backward_modules(root: OperatorNode, backward_modules: List[BackwardNode]):
+        backward_modules.sort(key=lambda x: (x.start_time, -x.end_time))
+
+        # each item is the (parent_node, child_index) pair currently being visited.
+        node_stack = []
+        module_index = 0
+        child_index = 0
+        current_node = root
+
+        staled_modules = []
+
+        while module_index < len(backward_modules):
+            module = backward_modules[module_index]
+            if current_node is None:
+                # ignore all remaining modules
+                staled_modules.append(module)
+                module_index += 1
+                continue
+
+            if module.end_time < current_node.start_time:
+                staled_modules.append(module)
+                module_index += 1
+                continue
+            elif module.start_time > current_node.end_time:
+                if node_stack:
+                    # pop the parent node and update the child_index accordingly.
+                    current_node, child_index = node_stack.pop()
+                    child_index += 1
+                else:
+                    # if there is no item left in the stack, set current_node to None.
+                    current_node = None
+                continue
+
+            while child_index < len(current_node.children):
+                if module.end_time < current_node.children[child_index].start_time:
+                    # The current module ends before the next child starts: stop the search
+                    # and keep child_index unchanged, so the module is treated as a child
+                    # of 'current_node' and the next module can continue from here.
+ break + elif module.start_time >= current_node.children[child_index].end_time: + child_index += 1 + else: + # current children contains the record + node_stack.append((current_node, child_index)) + current_node = current_node.children[child_index] + child_index = 0 + + # when code execute here, it means the current_node will be the parent of backward module + # Add the module into current_node + current_node.children.insert(child_index, module) + # since the children number is increased by 1, we need increment the child_index. + child_index += 1 + module_index += 1 diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/overall_parser.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/overall_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..df1f29927e84d7321e6cb073d7d08b02c5febe7d --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/overall_parser.py @@ -0,0 +1,110 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# -------------------------------------------------------------------------- +from typing import List, Tuple + +from .. import utils +from .event_parser import ProfileRole +from .range_utils import (get_ranges_sum, intersection_ranges_lists, + merge_ranges, subtract_ranges_lists) + +logger = utils.get_logger() + + +class OverallParser(object): + class Costs: + def __init__(self, costs: List[float] = None): + # the cost length is len(ProfileRole) + if costs is None: + self.costs = [0.] * len(ProfileRole) + else: + self.costs = costs + + @classmethod + def create_from_statistics(cls, statistics: 'OverallParser.Statistics', total_duration: int): + costs = [0.] * len(ProfileRole) + for i in range(len(statistics.cost_ranges)): + costs[i] = get_ranges_sum(statistics.cost_ranges[i]) + costs[ProfileRole.Total] = total_duration + return cls(costs) + + class Statistics: + def __init__(self, cost_ranges: List[List[Tuple[int, int]]]): + if not cost_ranges: + raise ValueError('the cost ranges is None') + + self.cost_ranges = cost_ranges + + @classmethod + def create_from_range(cls, steps: List[Tuple[int, int]], role_ranges: List[List[Tuple[int, int]]]): + assert len(role_ranges) == ProfileRole.Total - 1 + + cost_ranges: List[List[Tuple[int, int]]] = [] + slots: List[Tuple[int, int]] = [] + for role in role_ranges: + if slots: + range = intersection_ranges_lists(slots, role) + else: + range = role + slots = merge_ranges(list(steps)) + cost_ranges.append(range) + slots = subtract_ranges_lists(slots, range) + # The last one is ProfileRole.Other + cost_ranges.append(slots) + + return cls(cost_ranges) + + def intersection_with_step(self, step: Tuple[int, int]): + cost_ranges: List[List[Tuple[int, int]]] = [] + step = [step] + for range in self.cost_ranges: + cost_ranges.append(intersection_ranges_lists(step, range)) + + return OverallParser.Statistics(cost_ranges) + + class StepCommunicationCosts: + def __init__(self): + self.computation: int = 0 + self.communication: int = 0 + self.overlap: int = 0 + self.other: int = 0 + + def __init__(self): + self.steps_costs: List[OverallParser.Costs] = [] + self.avg_costs = OverallParser.Costs() + self.communication_overlap: List[OverallParser.StepCommunicationCosts] = [] + + def aggregate(self, steps: List[Tuple[int, int]], role_ranges: List[List[Tuple[int, int]]]): + logger.debug('Overall, statistics') + global_stats = OverallParser.Statistics.create_from_range(steps, role_ranges) + 
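The step-level breakdown in this parser is computed almost entirely with the interval helpers imported from `range_utils` (defined in the next file). As a rough sketch of their semantics, here is a minimal reimplementation of intersection only, over invented numbers; it is not the module's actual code:

```python
from typing import List, Tuple

Range = Tuple[int, int]

def intersect(a: List[Range], b: List[Range]) -> List[Range]:
    """Sweep two sorted, non-overlapping range lists and collect their overlaps."""
    out, i, j = [], 0, 0
    while i < len(a) and j < len(b):
        lo = max(a[i][0], b[j][0])
        hi = min(a[i][1], b[j][1])
        if lo < hi:
            out.append((lo, hi))
        if a[i][1] <= b[j][1]:  # advance whichever range ends first
            i += 1
        else:
            j += 1
    return out

kernel = [(10, 40), (150, 260)]  # hypothetical GPU-busy ranges, in us
comm = [(30, 60), (140, 160)]    # hypothetical communication-busy ranges
assert intersect(kernel, comm) == [(30, 40), (150, 160)]
```

Subtraction and summing follow the same sweep pattern, which is why the overlap accounting below can stay a handful of one-liners.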
if role_ranges[ProfileRole.Kernel]:
+            comm_comp_overlap = intersection_ranges_lists(
+                role_ranges[ProfileRole.Kernel], role_ranges[ProfileRole.Communication])
+        else:
+            comm_comp_overlap = intersection_ranges_lists(
+                role_ranges[ProfileRole.CpuOp], role_ranges[ProfileRole.Communication])
+
+        logger.debug('Overall, aggregation')
+        for i, step in enumerate(steps):
+            steps_stat = global_stats.intersection_with_step(step)
+            self.steps_costs.append(OverallParser.Costs.create_from_statistics(steps_stat, step[1] - step[0]))
+            for cost_index in range(len(self.avg_costs.costs)):
+                self.avg_costs.costs[cost_index] += self.steps_costs[i].costs[cost_index]
+
+            comm_costs = OverallParser.StepCommunicationCosts()
+            comm_costs.overlap = get_ranges_sum(intersection_ranges_lists([step], comm_comp_overlap))
+            if role_ranges[ProfileRole.Kernel]:
+                comm_costs.computation = get_ranges_sum(
+                    intersection_ranges_lists([step], role_ranges[ProfileRole.Kernel]))
+            else:
+                comm_costs.computation = get_ranges_sum(
+                    intersection_ranges_lists([step], role_ranges[ProfileRole.CpuOp]))
+            comm_costs.communication = get_ranges_sum(
+                intersection_ranges_lists([step], role_ranges[ProfileRole.Communication]))
+            comm_costs.other = self.steps_costs[i].costs[ProfileRole.Total] +\
+                comm_costs.overlap - comm_costs.computation - comm_costs.communication
+            self.communication_overlap.append(comm_costs)
+
+        valid_steps = len(steps)
+        for i in range(len(self.avg_costs.costs)):
+            self.avg_costs.costs[i] /= valid_steps
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/range_utils.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/range_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c927d5acbe658f9d80f4a251635b61cb05bee1ef
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/range_utils.py
@@ -0,0 +1,190 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# -------------------------------------------------------------------------
+from typing import List, Tuple
+
+
+# src_ranges: item of (start_time, end_time, value)
+def merge_ranges_with_value(src_ranges):
+    from collections import namedtuple
+    from enum import IntEnum
+
+    class EndpointTypes(IntEnum):
+        START = 0
+        END = 1
+
+    EndPoint = namedtuple('EndPoint', ['time', 'pt_type', 'value'])
+
+    merged_ranges = []
+    if len(src_ranges) > 0:
+        # Build tuples of (time, type, value)
+        endpoints: List[EndPoint] = []
+        for r in src_ranges:
+            endpoints.append(EndPoint(r[0], EndpointTypes.START, r[2]))
+            endpoints.append(EndPoint(r[1], EndpointTypes.END, r[2]))
+        endpoints.sort(key=lambda x: [x.time, int(x.pt_type)])  # Sort STARTs before ENDs when times are equal.
+
+        last_endpoint_time = endpoints[0].time
+        last_value = endpoints[0].value
+        for i in range(1, len(endpoints)):
+            ep = endpoints[i]
+            if ep.time > last_endpoint_time and last_value > 0.0:
+                approximated_sm_efficiency = min(last_value, 1.0)
+                merged_ranges.append((last_endpoint_time, ep.time, approximated_sm_efficiency))
+            last_endpoint_time = ep.time
+            if ep.pt_type == EndpointTypes.START:
+                last_value += ep.value
+            else:
+                last_value -= ep.value
+
+    return merged_ranges
+
+
+# range_list1 item is length 3. range_list2 item is length 2.
+# Return value's item is length 3.
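+# e.g. intersecting [(0, 10, 0.5)] with [(2, 4), (6, 20)] yields [(2, 4, 0.5), (6, 10, 0.5)].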
+def intersection_ranges_lists_with_value(range_list1, range_list2) -> List[Tuple[int, int, int]]: + range_list_dst = [] + if len(range_list1) == 0 or len(range_list2) == 0: + return range_list_dst + r1 = range_list1[0] + r2 = range_list2[0] + i1 = i2 = 0 + while i1 < len(range_list1): + if i2 == len(range_list2): + break + elif r2[1] <= r1[0]: + r2, i2 = pop_list(range_list2, i2) + elif r2[0] <= r1[0] and r2[1] < r1[1]: + assert (r2[1] > r1[0]) + range_list_dst.append((r1[0], r2[1], r1[2])) + r1 = (r2[1], r1[1], r1[2]) + r2, i2 = pop_list(range_list2, i2) + elif r2[0] <= r1[0]: + assert (r2[1] >= r1[1]) + range_list_dst.append(r1) + r2 = (r1[1], r2[1]) + r1, i1 = pop_list(range_list1, i1) + elif r2[1] < r1[1]: + assert (r2[0] > r1[0]) + range_list_dst.append((r2[0], r2[1], r1[2])) + r1 = (r2[1], r1[1], r1[2]) + r2, i2 = pop_list(range_list2, i2) + elif r2[0] < r1[1]: + assert (r2[1] >= r1[1]) + range_list_dst.append((r2[0], r1[1], r1[2])) + r2 = (r1[1], r2[1]) + r1, i1 = pop_list(range_list1, i1) + else: + assert (r2[0] >= r1[1]) + r1, i1 = pop_list(range_list1, i1) + return range_list_dst + + +def subtract_ranges_lists(range_list1: List[Tuple[int, int]], + range_list2: List[Tuple[int, int]]) -> List[Tuple[int, int]]: + range_list_dst = [] + if len(range_list1) == 0: + return range_list_dst + if len(range_list2) == 0: + range_list_dst = list(range_list1) + return range_list_dst + r1 = range_list1[0] + r2 = range_list2[0] + i1 = i2 = 0 + while i1 < len(range_list1): + if i2 == len(range_list2): + range_list_dst.append(r1) + r1, i1 = pop_list(range_list1, i1) + elif r2[1] <= r1[0]: + r2, i2 = pop_list(range_list2, i2) + elif r2[0] <= r1[0] and r2[1] < r1[1]: + r1 = (r2[1], r1[1]) + r2, i2 = pop_list(range_list2, i2) + elif r2[0] <= r1[0]: + assert (r2[1] >= r1[1]) + r2 = (r1[1], r2[1]) + r1, i1 = pop_list(range_list1, i1) + elif r2[0] < r1[1]: + assert (r2[0] > r1[0]) + range_list_dst.append((r1[0], r2[0])) + r1 = (r2[0], r1[1]) + else: + assert (r2[0] >= r1[1]) + range_list_dst.append(r1) + r1, i1 = pop_list(range_list1, i1) + return range_list_dst + + +def intersection_ranges_lists(range_list1: List[Tuple[int, int]], + range_list2: List[Tuple[int, int]]) -> List[Tuple[int, int]]: + range_list_dst = [] + if len(range_list1) == 0 or len(range_list2) == 0: + return range_list_dst + r1 = range_list1[0] + r2 = range_list2[0] + i1 = i2 = 0 + while i1 < len(range_list1): + if i2 == len(range_list2): + break + elif r2[1] <= r1[0]: + r2, i2 = pop_list(range_list2, i2) + elif r2[0] <= r1[0] and r2[1] < r1[1]: + assert (r2[1] > r1[0]) + range_list_dst.append((r1[0], r2[1])) + r1 = (r2[1], r1[1]) + r2, i2 = pop_list(range_list2, i2) + elif r2[0] <= r1[0]: + assert (r2[1] >= r1[1]) + range_list_dst.append(r1) + r2 = (r1[1], r2[1]) + r1, i1 = pop_list(range_list1, i1) + elif r2[1] < r1[1]: + assert (r2[0] > r1[0]) + range_list_dst.append(r2) + r1 = (r2[1], r1[1]) + r2, i2 = pop_list(range_list2, i2) + elif r2[0] < r1[1]: + assert (r2[1] >= r1[1]) + range_list_dst.append((r2[0], r1[1])) + r2 = (r1[1], r2[1]) + r1, i1 = pop_list(range_list1, i1) + else: + assert (r2[0] >= r1[1]) + r1, i1 = pop_list(range_list1, i1) + return range_list_dst + + +def get_ranges_sum(ranges: List[Tuple[int, int]]) -> int: + sum: int = 0 + for range in ranges: + sum += (range[1] - range[0]) + return sum + + +def pop_list(range_list, index): + next_index = index + 1 + if next_index >= len(range_list): + return None, len(range_list) + next_item = range_list[next_index] + return next_item, next_index + + +def 
merge_ranges(src_ranges, is_sorted=False) -> List[Tuple[int, int]]: + if not src_ranges: + # return empty list if src_ranges is None or its length is zero. + return [] + + if not is_sorted: + src_ranges.sort(key=lambda x: x[0]) + + merged_ranges = [] + merged_ranges.append(src_ranges[0]) + for src_id in range(1, len(src_ranges)): + src_range = src_ranges[src_id] + if src_range[1] > merged_ranges[-1][1]: + if src_range[0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], src_range[1]) + else: + merged_ranges.append((src_range[0], src_range[1])) + + return merged_ranges diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..8305a4c44b1bfa75035db78082944b2d84bfe883 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py @@ -0,0 +1,572 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# -------------------------------------------------------------------------- +from collections import OrderedDict +from typing import Dict, Iterable, List + +from .. import consts, utils +from ..run import DistributedRunProfile, RunProfile +from .data import DistributedRunProfileData, RunProfileData +from .module_op import aggegate_module_view, aggegate_pl_module_view +from .op_agg import KernelAggByNameOp, OperatorAgg +from .overall_parser import ProfileRole + +logger = utils.get_logger() + + +class RunGenerator(object): + def __init__(self, worker, span, profile_data: RunProfileData): + self.worker = worker + self.span = span + self.profile_data = profile_data + + def generate_run_profile(self): + profile_run = RunProfile(self.worker, self.span) + profile_run.is_pytorch_lightning = self.profile_data.is_pytorch_lightning + profile_run.has_runtime = self.profile_data.has_runtime + profile_run.has_kernel = self.profile_data.has_kernel + profile_run.has_communication = self.profile_data.has_communication + profile_run.has_memcpy_or_memset = self.profile_data.has_memcpy_or_memset + profile_run.profiler_start_ts = self.profile_data.profiler_start_ts + profile_run.views.append(consts.OVERALL_VIEW) + profile_run.overview = self._generate_overview() + + profile_run.views.append(consts.OP_VIEW) + profile_run.operation_pie_by_name = self._generate_op_pie() + profile_run.operation_table_by_name = self._generate_op_table(self.profile_data.op_list_groupby_name) + profile_run.operation_stack_by_name = self._generate_op_table_for_stack(False) + profile_run.operation_pie_by_name_input = self._generate_op_pie(True) + profile_run.operation_table_by_name_input = self._generate_op_table( + self.profile_data.op_list_groupby_name_input, True) + profile_run.operation_stack_by_name_input = self._generate_op_table_for_stack(True) + + if self.profile_data.has_kernel: + profile_run.views.append(consts.KERNEL_VIEW) + profile_run.kernel_op_table = self._generate_kernel_op_table() + profile_run.kernel_pie = self._generate_kernel_pie() + profile_run.kernel_table = self._generate_kernel_table() + profile_run.tc_pie = self._generate_tc_pie() + + profile_run.views.append(consts.TRACE_VIEW) + profile_run.trace_file_path = self.profile_data.trace_file_path + + profile_run.gpu_metrics = self.profile_data.gpu_metrics_parser.get_gpu_metrics() + + gpu_infos = {gpu_id: RunGenerator._get_gpu_info(self.profile_data.device_props, 
gpu_id) + for gpu_id in self.profile_data.gpu_metrics_parser.gpu_ids} + gpu_infos = {gpu_id: gpu_info for gpu_id, gpu_info in gpu_infos.items() if gpu_info is not None} + + profile_run.gpu_summary, profile_run.gpu_tooltip = \ + self.profile_data.gpu_metrics_parser.get_gpu_metrics_data_tooltip( + gpu_infos, self.profile_data.tc_ratio) + + profile_run.tid2tree = self.profile_data.tid2tree + profile_run.pl_tid2tree = self.profile_data.pl_tid2tree + + if self.profile_data.memory_snapshot: + profile_run.views.append(consts.MEMORY_VIEW) + profile_run.memory_snapshot = self.profile_data.memory_snapshot + + profile_run.module_stats = aggegate_module_view(self.profile_data.tid2tree, self.profile_data.events) + profile_run.pl_module_stats = aggegate_pl_module_view(self.profile_data.tid2tree, self.profile_data.events) + if profile_run.is_pytorch_lightning and profile_run.pl_module_stats: + profile_run.views.append(consts.LIGHTNING_VIEW) + elif profile_run.module_stats: + profile_run.views.append(consts.MODULE_VIEW) + + return profile_run + + def _generate_overview(self): + def build_part_time_str(part_cost: float, part_name: str): + format_str = ('
<div class="visualization-tooltip" style="white-space: nowrap;">' + 'Step {}<br>' + 'Total: {}us<br>' + '<b>{}: {}us</b><br>' + 'Percentage: {}%' + '</div>
') + percentage = round(100 * part_cost / costs.costs[ProfileRole.Total], 2) + return format_str.format(step_name, costs.costs[ProfileRole.Total], part_name, part_cost, percentage) + + def build_avg_cost_dict(part_name: str, part_cost: float): + cost_dict = {'name': part_name, + 'description': '', + 'value': round(part_cost), + 'extra': round(100 * part_cost / self.profile_data.avg_costs.costs[ProfileRole.Total], 2)} + return cost_dict + + show_gpu = (self.profile_data.has_runtime + or self.profile_data.has_kernel or self.profile_data.has_memcpy_or_memset) + + column_tootip = {'type': 'string', 'role': 'tooltip', 'p': {'html': 'true'}} + data = {} + data['steps'] = {} + data['steps']['columns'] = [{'type': 'string', 'name': 'Step'}] + if show_gpu: + data['steps']['columns'].extend([{'type': 'number', 'name': 'Kernel'}, + column_tootip, + {'type': 'number', 'name': 'Memcpy'}, + column_tootip, + {'type': 'number', 'name': 'Memset'}, + column_tootip]) + if self.profile_data.has_communication: + data['steps']['columns'].extend([{'type': 'number', 'name': 'Communication'}, + column_tootip]) + if show_gpu: + data['steps']['columns'].extend([{'type': 'number', 'name': 'Runtime'}, + column_tootip]) + data['steps']['columns'].extend([{'type': 'number', 'name': 'DataLoader'}, + column_tootip, + {'type': 'number', 'name': 'CPU Exec'}, + column_tootip, + {'type': 'number', 'name': 'Other'}, + column_tootip]) + + data['steps']['rows'] = [] + for i in range(len(self.profile_data.steps_costs)): + costs = self.profile_data.steps_costs[i] + step_name = self.profile_data.steps_names[i] + row = [step_name] + if show_gpu: + row.extend([costs.costs[ProfileRole.Kernel], + build_part_time_str(costs.costs[ProfileRole.Kernel], 'Kernel'), + costs.costs[ProfileRole.Memcpy], + build_part_time_str(costs.costs[ProfileRole.Memcpy], 'Memcpy'), + costs.costs[ProfileRole.Memset], + build_part_time_str(costs.costs[ProfileRole.Memset], 'Memset')]) + if self.profile_data.has_communication: + row.extend([costs.costs[ProfileRole.Communication], + build_part_time_str(costs.costs[ProfileRole.Communication], 'Communication')]) + if show_gpu: + row.extend([costs.costs[ProfileRole.Runtime], + build_part_time_str(costs.costs[ProfileRole.Runtime], 'Runtime')]) + row.extend([costs.costs[ProfileRole.DataLoader], + build_part_time_str(costs.costs[ProfileRole.DataLoader], 'DataLoader'), + costs.costs[ProfileRole.CpuOp], + build_part_time_str(costs.costs[ProfileRole.CpuOp], 'CPU Exec'), + costs.costs[ProfileRole.Other], + build_part_time_str(costs.costs[ProfileRole.Other], 'Other')]) + data['steps']['rows'].append(row) + + avg_costs = [] + if show_gpu: + avg_costs.extend([ + build_avg_cost_dict('Kernel', self.profile_data.avg_costs.costs[ProfileRole.Kernel]), + build_avg_cost_dict('Memcpy', self.profile_data.avg_costs.costs[ProfileRole.Memcpy]), + build_avg_cost_dict('Memset', self.profile_data.avg_costs.costs[ProfileRole.Memset]) + ]) + if self.profile_data.has_communication: + avg_costs.extend([ + build_avg_cost_dict('Communication', self.profile_data.avg_costs.costs[ProfileRole.Communication]) + ]) + if show_gpu: + avg_costs.extend([ + build_avg_cost_dict('Runtime', self.profile_data.avg_costs.costs[ProfileRole.Runtime]) + ]) + avg_costs.extend([ + build_avg_cost_dict('DataLoader', self.profile_data.avg_costs.costs[ProfileRole.DataLoader]), + build_avg_cost_dict('CPU Exec', self.profile_data.avg_costs.costs[ProfileRole.CpuOp]), + build_avg_cost_dict('Other', self.profile_data.avg_costs.costs[ProfileRole.Other]) + ]) + + 
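For orientation, the 'steps' series assembled above ends up shaped roughly as follows. The values are invented, and the real payload repeats one value column plus one tooltip column per visible role:

```python
# Hypothetical shape of data['steps'] for a run where only the Kernel role is shown.
example_steps = {
    'columns': [
        {'type': 'string', 'name': 'Step'},
        {'type': 'number', 'name': 'Kernel'},
        {'type': 'string', 'role': 'tooltip', 'p': {'html': 'true'}},
    ],
    'rows': [
        # one tooltip string follows each numeric value
        ['5', 120, '<div class="visualization-tooltip" ...>Step 5 ...</div>'],
        ['6', 95, '<div class="visualization-tooltip" ...>Step 6 ...</div>'],
    ],
}
```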
data['performance'] = [{'name': 'Average Step Time', 'description': '', + 'value': round(self.profile_data.avg_costs.costs[ProfileRole.Total]), + 'extra': 100, 'children': avg_costs}] + + if len(self.profile_data.recommendations) == 0: + html = '
<li>N/A</li>' + else: + html = '' + for recommendation in self.profile_data.recommendations: + html += '<li>{}</li>'.format(recommendation) + data['recommendations'] = '<ul>{}</ul>
    '.format(html) + + return data + + def _generate_op_pie(self, group_by_input_shape: bool = False): + op_device_total_time = [] + op_device_self_time = [] + op_host_total_time = [] + op_host_self_time = [] + + if group_by_input_shape: + op_list = self.profile_data.op_list_groupby_name_input + else: + op_list = self.profile_data.op_list_groupby_name + + for op_agg in op_list: + # Whether device_duration & self_device_duration are accurate or not depends on the input tracing data. + if op_agg.device_duration > 0: + op_device_total_time.append([op_agg.name, op_agg.device_duration]) + if op_agg.self_device_duration > 0: + op_device_self_time.append([op_agg.name, op_agg.self_device_duration]) + if op_agg.host_duration > 0: + op_host_total_time.append([op_agg.name, op_agg.host_duration]) + if op_agg.self_host_duration > 0: + op_host_self_time.append([op_agg.name, op_agg.self_host_duration]) + + op_device_total_time.sort(key=lambda x: x[1], reverse=True) + op_device_self_time.sort(key=lambda x: x[1], reverse=True) + op_host_total_time.sort(key=lambda x: x[1], reverse=True) + op_host_self_time.sort(key=lambda x: x[1], reverse=True) + + data = {} + device_total_time = {} + device_self_time = {} + host_total_time = {} + host_self_time = {} + + if len(op_device_total_time) > 0: + device_total_time['title'] = 'Device Total Time (us)' + device_total_time['columns'] = [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}] + device_total_time['rows'] = op_device_total_time + else: + device_total_time = None + + if len(op_device_self_time) > 0: + device_self_time['title'] = 'Device Self Time (us)' + device_self_time['columns'] = [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}] + device_self_time['rows'] = op_device_self_time + else: + device_self_time = None + + if len(op_host_total_time) > 0: + host_total_time['title'] = 'Host Total Time (us)' + host_total_time['columns'] = [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}] + host_total_time['rows'] = op_host_total_time + else: + host_total_time = None + + if len(op_host_self_time) > 0: + host_self_time['title'] = 'Host Self Time (us)' + host_self_time['columns'] = [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}] + host_self_time['rows'] = op_host_self_time + else: + host_self_time = None + + data['device_total_time'] = device_total_time + data['device_self_time'] = device_self_time + data['host_total_time'] = host_total_time + data['host_self_time'] = host_self_time + + return data + + def _generate_op_table(self, op_list: Iterable[OperatorAgg], group_by_input_shape=False, call_stack=False): + show_gpu = self.profile_data.has_kernel or self.profile_data.has_memcpy_or_memset + + if group_by_input_shape: + stack_list_dict = self.profile_data.stack_lists_group_by_name_input + else: + stack_list_dict = self.profile_data.stack_lists_group_by_name + + op_list = sorted(op_list, + key=lambda x: x.self_device_duration if show_gpu else x.self_host_duration, + reverse=True) + + data = list() + result = { + 'metadata': { + 'sort': 'device_self_duration' if show_gpu else 'host_self_duration', + 'tooltips': { + 'tc_eligible': consts.TOOLTIP_OP_TC_ELIGIBLE, + 'tc_self_ratio': consts.TOOLTIP_OP_TC_SELF, + 'tc_total_ratio': consts.TOOLTIP_OP_TC_TOTAL + } + }, + 'data': data + } + for op in op_list: + # Whether device_duration & self_device_duration are accurate or not depends on the input tracing data. 
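+            # (e.g. a trace captured without GPU activity yields zero device durations here.)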
+ row = dict() + row['name'] = op.name + if group_by_input_shape: + row['input_shape'] = op.input_shape + row['calls'] = op.calls + if show_gpu: + row['device_self_duration'] = round(op.self_device_duration) + row['device_total_duration'] = round(op.device_duration) + row['host_self_duration'] = round(op.self_host_duration) + row['host_total_duration'] = round(op.host_duration) + row['tc_eligible'] = 'Yes' if op.tc_eligible else 'No' + row['tc_self_ratio'] = round(100 * op.tc_self_ratio, 2) + row['tc_total_ratio'] = round(100 * op.tc_total_ratio, 2) + if call_stack: + row['call_stack'] = op.callstacks.pop() + else: + if group_by_input_shape: + key = op.name + '###' + str(op.input_shape) + else: + key = op.name + row['has_call_stack'] = key in stack_list_dict + data.append(row) + + return result + + def _generate_op_table_for_stack(self, group_by_input_shape: bool): + if group_by_input_shape: + stack_list_dict = self.profile_data.stack_lists_group_by_name_input + else: + stack_list_dict = self.profile_data.stack_lists_group_by_name + + result = dict() + for k, v in stack_list_dict.items(): + result[k] = self._generate_op_table(v, group_by_input_shape, True) + return result + + def _generate_kernel_op_table(self): + table = {} + result = { + 'metadata': { + 'sort': 'Total Duration (us)' + }, + 'data': table + } + table['columns'] = [{'type': 'string', 'name': 'Name'}, + {'type': 'string', 'name': 'Operator'}, + {'type': 'string', 'name': 'Grid'}, + {'type': 'string', 'name': 'Block'}, + {'type': 'number', 'name': 'Register Per Thread'}, + {'type': 'number', 'name': 'Shared Memory'}, + {'type': 'string', 'name': 'Kernel Uses Tensor Cores', + 'tooltip': consts.TOOLTIP_KERNEL_USES_TC}, + {'type': 'string', 'name': 'Op is Tensor Cores eligible', + 'tooltip': consts.TOOLTIP_KERNEL_OP_TC_ELIGIBLE}] + col_names = ['Calls', 'Total Duration (us)', 'Mean Duration (us)', 'Max Duration (us)', 'Min Duration (us)'] + for column in col_names: + table['columns'].append({'type': 'number', 'name': column}) + gpu_metrics_columns = self.profile_data.gpu_metrics_parser.get_gpu_metrics_columns() + table['columns'].extend(gpu_metrics_columns) + + table['rows'] = [] + kernel_list: List[KernelAggByNameOp] = sorted( + self.profile_data.kernel_list_groupby_name_op, key=lambda x: x.total_duration, reverse=True) + for agg_by_name_op in kernel_list: + kernel_op_row = [agg_by_name_op.name, agg_by_name_op.op_name, + str(agg_by_name_op.grid), str(agg_by_name_op.block), + str(agg_by_name_op.regs_per_thread or '0'), str(agg_by_name_op.shared_memory or '0'), + 'Yes' if agg_by_name_op.tc_used else 'No', + 'Yes' if agg_by_name_op.op_tc_eligible else 'No', + agg_by_name_op.calls, + agg_by_name_op.total_duration, round(agg_by_name_op.avg_duration), + agg_by_name_op.max_duration, agg_by_name_op.min_duration] + if self.profile_data.gpu_metrics_parser.has_blocks_per_sm: + kernel_op_row.append(round(agg_by_name_op.avg_blocks_per_sm, 2)) + if self.profile_data.gpu_metrics_parser.has_occupancy: + kernel_op_row.append(round(agg_by_name_op.avg_occupancy, 2)) + table['rows'].append(kernel_op_row) + return result + + def _generate_kernel_pie(self): + pie = {'columns': [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}], 'rows': []} + for _id, (name, row) in enumerate(self.profile_data.kernel_stat.iterrows()): + pie['rows'].append([name, row['sum']]) + data = {'total': pie} + return data + + def _generate_kernel_table(self): + table = {} + result = { + 'metadata': { + 'sort': 'Total Duration (us)' + }, + 'data': table + 
} + table['columns'] = [{'type': 'string', 'name': 'Name'}, + {'type': 'string', 'name': 'Tensor Cores Used', + 'tooltip': consts.TOOLTIP_KERNEL_USES_TC}] + columns = ['count', 'sum', 'mean', 'max', 'min'] + round_digits = [0, 0, 0, 0, 0] + if self.profile_data.gpu_metrics_parser.has_blocks_per_sm: + columns.append('blocks_per_sm') + round_digits.append(2) + if self.profile_data.gpu_metrics_parser.has_occupancy: + columns.append('occupancy') + round_digits.append(2) + col_names = ['Calls', 'Total Duration (us)', 'Mean Duration (us)', 'Max Duration (us)', 'Min Duration (us)'] + for column in col_names: + table['columns'].append({'type': 'number', 'name': column}) + gpu_metrics_columns = self.profile_data.gpu_metrics_parser.get_gpu_metrics_columns() + table['columns'].extend(gpu_metrics_columns) + + table['rows'] = [] + for _id, (name, row) in enumerate(self.profile_data.kernel_stat.iterrows()): + kernel_row = [name, 'Yes' if row['tc_used'] else 'No'] + for i, column in enumerate(columns): + kernel_row.append(round(row[column]) if round_digits[i] == 0 + else round(row[column], round_digits[i])) + table['rows'].append(kernel_row) + return result + + def _generate_tc_pie(self): + pie = {'columns': [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}], 'rows': []} + pie['rows'].append(['Using Tensor Cores', self.profile_data.tc_used_ratio]) + pie['rows'].append(['Not Using Tensor Cores', 1.0 - self.profile_data.tc_used_ratio]) + data = {'total': pie} + return data + + @staticmethod + def _get_gpu_info(device_props, gpu_id): + if (device_props is None) or (gpu_id >= len(device_props)) or (gpu_id < 0): + return None + + device_prop: Dict = device_props[gpu_id] + gpu_info = {} + name = device_prop.get('name') + if name is not None: + gpu_info['Name'] = name + + mem = device_prop.get('totalGlobalMem') + if mem is not None: + gpu_info['Memory'] = '{} GB'.format(round(float(mem) / 1024 / 1024 / 1024, 2)) + gpu_info['Memory Raw'] = mem + + major = device_prop.get('computeMajor') + minor = device_prop.get('computeMinor') + if major is not None and minor is not None: + gpu_info['Compute Capability'] = '{}.{}'.format(major, minor) + + return gpu_info + + +class DistributedRunGenerator(object): + def __init__(self, all_profile_data: Iterable[DistributedRunProfileData], span): + self.all_profile_data = all_profile_data + self.span = span + + def generate_run_profile(self): + profile_run = DistributedRunProfile(self.span) + profile_run.views.append(consts.DISTRIBUTED_VIEW) + profile_run.gpu_info = self._generate_gpu_info() + profile_run.steps_to_overlap = self._generate_overlap_graph() + profile_run.steps_to_wait = self._generate_wait_graph() + profile_run.comm_ops = self._generate_ops_table() + return profile_run + + def _generate_gpu_info(self): + # first key is node name, the second key is process id, the third key is GPU0/, + # the value is the gpu info json + result: Dict[str, Dict[str, Dict[str, Dict]]] = OrderedDict() + index = 0 + for data in sorted(self.all_profile_data, key=lambda x: x.worker): + if not data.device_props: + continue + + match = consts.NODE_PROCESS_PATTERN.match(data.worker) + if match: + node = match.group(1) + process_id = match.group(2) + else: + logger.warning('cannot parse node name from worker name {}'.format(data.worker)) + node = data.worker + process_id = index + index += 1 + if node not in result: + result[node] = OrderedDict() + + process_id = 'Process ' + str(process_id) + result[node][process_id] = OrderedDict() + for used_device in 
data.used_devices: + gpu_info = RunGenerator._get_gpu_info(data.device_props, used_device) + if gpu_info is not None: + result[node][process_id]['GPU'+str(used_device)] = gpu_info + + if result: + for k, v in result.items(): + result[k] = OrderedDict(sorted(v.items())) + return { + 'metadata': {'title': 'Device Information'}, + 'data': result + } + else: + return None + + def _generate_overlap_graph(self): + result = dict() + result['metadata'] = { + 'title': 'Computation/Communication Overview', + 'legends': ['Computation', 'Overlapping', 'Communication', 'Other'], + 'units': 'us' + } + steps_to_overlap: Dict[str, Dict[str, List[int]]] = OrderedDict() + steps_to_overlap['all'] = OrderedDict() + for data in self.all_profile_data: + steps_to_overlap['all'][data.worker] = [0, 0, 0, 0] + step_number = len(data.steps_names) + for i, step_name in enumerate(data.steps_names): + steps_to_overlap.setdefault(step_name, OrderedDict()) + costs = data.comm_overlap_costs[i] + steps_to_overlap[step_name][data.worker] = [ + costs.computation - costs.overlap, + costs.overlap, + costs.communication - costs.overlap, + costs.other + ] + steps_to_overlap['all'][data.worker] = [ + sum(x) for x in zip(steps_to_overlap['all'][data.worker], steps_to_overlap[step_name][data.worker])] + steps_to_overlap['all'][data.worker] = [x/step_number for x in steps_to_overlap['all'][data.worker]] + for k, v in steps_to_overlap.items(): + steps_to_overlap[k] = OrderedDict(sorted(v.items())) + result['data'] = steps_to_overlap + return result + + def _generate_wait_graph(self): + result = dict() + result['metadata'] = { + 'title': 'Synchronizing/Communication Overview', + 'legends': ['Data Transfer Time', 'Synchronizing Time'], + 'units': 'us' + } + steps_to_wait: Dict[str, Dict[str, List[int]]] = OrderedDict() + + steps_to_wait['all'] = OrderedDict() + for data in self.all_profile_data: + steps_to_wait['all'][data.worker] = [0, 0] + step_number = len(data.step_comm_stats.values()) + for step, comm_stats in data.step_comm_stats.items(): + steps_to_wait.setdefault(step, OrderedDict())[data.worker] = [ + comm_stats[1], + comm_stats[0]-comm_stats[1] + ] + steps_to_wait['all'][data.worker] = [ + sum(x) for x in zip(steps_to_wait['all'][data.worker], steps_to_wait[step][data.worker])] + steps_to_wait['all'][data.worker] = [x/step_number for x in steps_to_wait['all'][data.worker]] + + for k, v in steps_to_wait.items(): + steps_to_wait[k] = OrderedDict(sorted(v.items())) + result['data'] = steps_to_wait + return result + + def _generate_ops_table(self): + result = dict() + result['metadata'] = {'title': 'Communication Operations Stats'} + workers_to_comm_ops = OrderedDict() + # Ignore the span for distributed view + for data in self.all_profile_data: + table = {} + table['columns'] = [{'type': 'string', 'name': 'Name'}] + col_names = [ + 'Calls', + 'Total Size (bytes)', + 'Avg Size (bytes)', + 'Total Latency (us)', + 'Avg Latency (us)', + 'Data Transfer Time (us)', + 'Avg Data Transfer Time (us)' + ] + for column in col_names: + table['columns'].append({'type': 'number', 'name': column}) + table['rows'] = [] + for op, stats in data.total_comm_stats.items(): + row = [ + op, + stats[0], + stats[1], + round(stats[1]/stats[0]), + stats[2], + round(stats[2]/stats[0]), + stats[3], + round(stats[3]/stats[0]) + ] + table['rows'].append(row) + workers_to_comm_ops[data.worker] = table + result['data'] = OrderedDict(sorted(workers_to_comm_ops.items())) + return result diff --git 
a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/tensor_core.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/tensor_core.py new file mode 100644 index 0000000000000000000000000000000000000000..3a69cf70b881acc4588682fc4440cb5534541eb1 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/tensor_core.py @@ -0,0 +1,52 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# ------------------------------------------------------------------------- +class TC_Allowlist_Meta(type): + # Enable grammar sugar as 'v in TC_Allowlist'. + def __contains__(cls, item): + return cls.__contains__(item) + + +class TC_Allowlist(metaclass=TC_Allowlist_Meta): + # Refer to https://github.com/NVIDIA/PyProf/blob/fd1b2902e3306119eee40ba6b6e8b2f816920c29/pyprof/prof/tc.py#L19 + allowlist = ['h884', 's884', 'h1688', 's1688', 'hmma', 'i8816', '16816', + 'dgrad_1x1_stride_2x2', 'first_layer_wgrad_kernel', 'conv1x1', + 'conv2d_c1_k1', 'direct_group', 'xmma_implicit_gemm', + 'xmma_sparse_conv', 'xmma_warp_specialized_implicit_gemm', + 'xmma_gemm', 'xmma_sparse_gemm', 'c1688'] + + @classmethod + def __contains__(cls, item): + # If kernel name contains substring equal to any one in allowlist, then it uses tensor core. + for pattern in cls.allowlist: + if pattern in item: + return True + return False + + +class TC_OP_Allowlist(metaclass=TC_Allowlist_Meta): + # Refer to https://github.com/pytorch/pytorch/blob/69b2bf70f9c0e591ce5e566afa59e19618031ead/aten/src/ATen/autocast_mode.cpp#L290-L351 # noqa: E501 + allowlist = ['aten::_convolution', 'aten::conv1d', 'aten::conv2d', 'aten::conv3d', 'aten::conv_tbc', + 'aten::conv_transpose1d', 'aten::conv_transpose2d', 'aten::conv_transpose3d', + 'aten::convolution', 'aten::cudnn_convolution', 'aten::cudnn_convolution_transpose', + 'aten::prelu', 'aten::addmm', 'aten::addmv', 'aten::addr', + 'aten::matmul', 'aten::mm', 'aten::mv', + 'aten::linear', 'aten::addbmm', 'aten::baddbmm', 'aten::bmm', + 'aten::chain_matmul', 'aten::linalg_multi_dot', + 'aten::_thnn_fused_lstm_cell', 'aten::_thnn_fused_gru_cell', 'aten::lstm_cell', + 'aten::gru_cell', 'aten::rnn_tanh_cell', 'aten::rnn_relu_cell', + # The backward ops are got by running above ops' backward + # and recording whether it launched kernels. + 'CudnnConvolutionBackward', 'BmmBackward0', + 'aten::cudnn_convolution_transpose_backward', 'CudnnConvolutionTransposeBackward', + 'MmBackward', 'aten::cudnn_convolution_backward_weight', 'aten::addmm_', + 'AddmvBackward', 'MvBackward', + 'aten::cudnn_convolution_transpose_backward_weight', + 'aten::cudnn_convolution_transpose_backward_input', + 'AddmmBackward', 'aten::cudnn_convolution_backward_input', + 'AddbmmBackward', 'aten::cudnn_convolution_backward'] + + @classmethod + def __contains__(cls, item): + # If operator name equals to any one in allowlist, then it is tensor core eligible. + return item in cls.allowlist diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/tensor_cores_parser.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/tensor_cores_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..e2372d9adc05e96f274300e4d91a23551ed555ec --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/tensor_cores_parser.py @@ -0,0 +1,77 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# ------------------------------------------------------------------------- +from typing import Dict, Iterable, List + +from .. import consts +from .node import OperatorNode + + +class TensorCoresParser: + def __init__(self, tc_ratio: List[float], tc_eligible_ops_kernel_ratio: float): + # For calculating Tensor Cores time ratio per GPU. + self.tc_ratio = tc_ratio + self.tc_eligible_ops_kernel_ratio = tc_eligible_ops_kernel_ratio + + @classmethod + def parse_events(cls, tid2tree: Dict[str, OperatorNode], ops: Iterable[OperatorNode], gpu_ids: Iterable[int]): + tc_ratio = cls._calculate_tc_ratio(ops, gpu_ids) + tc_eligible_ops_kernel_ratio = cls._get_tc_eligible_ops_kernel_ratio(tid2tree, ops) + return cls(tc_ratio, tc_eligible_ops_kernel_ratio) + + @staticmethod + def _calculate_tc_ratio(ops: Iterable[OperatorNode], gpu_ids: Iterable[int]): + tc_ratio: List[float] = [None] * consts.MAX_GPU_PER_NODE + tc_time = [0] * consts.MAX_GPU_PER_NODE + total_time = [0] * consts.MAX_GPU_PER_NODE + has_kernel = False + for op in ops: + for rt in op.runtimes: + # 'CallTreeRoot' & 'dummy' kernels are launched out of profiler step, so don't count them. + if not (op.name == 'CallTreeRoot' and rt.name == 'dummy'): + for k in rt.get_kernels(): + has_kernel = True + dur = k.end_time - k.start_time + is_tc_used = k.tc_used + if is_tc_used: + tc_time[k.device_id] += dur + total_time[k.device_id] += dur + if has_kernel: # If no kernel, then keep all self.tc_ratio as None. + for gpu_id in gpu_ids: + if total_time[gpu_id] > 0: + tc_ratio[gpu_id] = tc_time[gpu_id] / total_time[gpu_id] + else: + tc_ratio[gpu_id] = 0.0 + return tc_ratio + + @staticmethod + def _get_bottom_tc_eligible_operators(op_tree_node: OperatorNode): + ops: List[OperatorNode] = [] + for child in op_tree_node.children: + child_ops = TensorCoresParser._get_bottom_tc_eligible_operators(child) + ops.extend(child_ops) + # TC-eligible ops which have children TC-eligible ops will not be regarded as 'bottom'. + if op_tree_node.tc_eligible and len(ops) == 0: + ops.append(op_tree_node) + return ops + + @staticmethod + def _get_tc_eligible_ops_kernel_ratio(tid2tree: Dict[int, OperatorNode], ops: Iterable[OperatorNode]): + def sum_self_kernel_time(ops: Iterable[OperatorNode]): + sum_time = 0 + for op in ops: + for rt in op.runtimes: + # 'CallTreeRoot' & 'dummy' kernels are launched out of profiler step, so don't count them. + if not (op.name == 'CallTreeRoot' and rt.name == 'dummy'): + for k in rt.get_kernels(): + sum_time += k.end_time - k.start_time + return sum_time + + ops_bottom_tc_eligible = [] + for root in tid2tree.values(): + ops_bottom_tc_eligible.extend(TensorCoresParser._get_bottom_tc_eligible_operators(root)) + ops_bottom_tc_eligible_kernel_sum = sum_self_kernel_time(ops_bottom_tc_eligible) + ops_kernel_sum = sum_self_kernel_time(ops) + tc_eligible_ops_kernel_ratio = ops_bottom_tc_eligible_kernel_sum / ops_kernel_sum \ + if ops_kernel_sum > 0 else 0.0 + return tc_eligible_ops_kernel_ratio diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/trace.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/trace.py new file mode 100644 index 0000000000000000000000000000000000000000..5f0da88055f21ccf8f2ed782e71aac4edb214fe6 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/trace.py @@ -0,0 +1,231 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# -------------------------------------------------------------------------- +from enum import IntEnum +from typing import Dict, Optional + +from .. import utils + +__all__ = ['EventTypes', 'create_event'] + +logger = utils.get_logger() + + +class DeviceType(IntEnum): + CPU = 0 + CUDA = 1 + + +class EventTypes(object): + TRACE = 'Trace' + OPERATOR = 'Operator' + PROFILER_STEP = 'ProfilerStep' + RUNTIME = 'Runtime' + KERNEL = 'Kernel' + MEMCPY = 'Memcpy' + MEMSET = 'Memset' + PYTHON = 'Python' + MEMORY = 'Memory' + PYTHON_FUNCTION = 'python_function' + MODULE = 'Module' + PL_PROFILE = 'pl_profile' + PL_MODULE = 'pl_module' + + +EventTypeMap = { + 'Trace': EventTypes.TRACE, + 'cpu_op': EventTypes.OPERATOR, + 'Operator': EventTypes.OPERATOR, + 'Runtime': EventTypes.RUNTIME, + 'Kernel': EventTypes.KERNEL, + 'Memcpy': EventTypes.MEMCPY, + 'gpu_memcpy': EventTypes.MEMCPY, + 'Memset': EventTypes.MEMSET, + 'gpu_memset': EventTypes.MEMSET, + 'Python': EventTypes.PYTHON, + 'Memory': EventTypes.MEMORY, + 'python_function': EventTypes.PYTHON_FUNCTION +} + + +class BaseEvent(object): + def __init__(self, type, data): + self.type: str = type + self.name: str = data.get('name') + self.ts: int = data.get('ts') + self.pid: int = data.get('pid') + self.tid: int = data.get('tid') + self.args: Dict = data.get('args', {}) + + +class DurationEvent(BaseEvent): + def __init__(self, type, data): + super().__init__(type, data) + self.category: str = data.get('cat', '') + self.duration: int = data.get('dur') + + extern_id: Optional[int] = self.args.get('external id') + if extern_id is None: + extern_id = self.args.get('External id') + self.external_id = extern_id + self.correlation_id: Optional[int] = self.args.get('correlation') + + +class KernelEvent(DurationEvent): + def __init__(self, type, data): + super().__init__(type, data) + self.occupancy = self.args.get('est. 
achieved occupancy %') + self.blocks_per_sm = self.args.get('blocks per SM') + self.grid = self.args.get('grid') + self.block = self.args.get('block') + self.regs_per_thread = self.args.get('registers per thread') + self.shared_memory = self.args.get('shared memory') + self.device_id = self.args.get('device') + + +class OperatorEvent(DurationEvent): + def __init__(self, type, data): + super().__init__(type, data) + self.callstack = self.args.get('Call stack') + self.input_type = self.args.get('Input type') + + shape = self.args.get('Input Dims') + if shape is None: + # Setting shape to '[]' other None is to align with autograd result + shape = self.args.get('Input dims', []) + self.input_shape = shape + + +class ProfilerStepEvent(OperatorEvent): + def __init__(self, data): + super().__init__(EventTypes.PROFILER_STEP, data) + # torch.profiler.profile.step will invoke record_function with name like 'ProfilerStep#5' + self.step: int = int(self.name.split('#')[1]) + + +class MemoryEvent(BaseEvent): + def __init__(self, type, data): + super().__init__(type, data) + self.scope: str = data.get('s', '') + self.device_id: int = self.args.get('Device Id') + dtype = self.args.get('Device Type') + if dtype is not None: + try: + dtype = DeviceType(dtype) + except ValueError: + dtype = None + + self.device_type: DeviceType = dtype + + @property + def addr(self): + return self.args.get('Addr') + + @property + def bytes(self): + return self.args.get('Bytes', 0) + + @property + def total_allocated(self): + return self.args.get('Total Allocated', float('nan')) + + @property + def total_reserved(self): + return self.args.get('Total Reserved', float('nan')) + + +class PythonFunctionEvent(DurationEvent): + def __init__(self, type, data): + super().__init__(type, data) + self.python_id: int = self.args.get('Python id') + self.python_parent_id: int = self.args.get('Python parent id') + + +class ModuleEvent(PythonFunctionEvent): + def __init__(self, data): + super().__init__(EventTypes.MODULE, data) + self.module_id: int = self.args.get('Python module id') + + +class PLProfileEvent(DurationEvent): + def __init__(self, data): + super().__init__(EventTypes.PL_PROFILE, data) + self.name = self.name.replace('[pl][profile]', '') + + +class PLModuleEvent(DurationEvent): + def __init__(self, data): + super().__init__(EventTypes.PL_MODULE, data) + self.module_id = 0 # just to be compatible with ModuleEvent processing + self.name = self.name.replace('[pl][module]', '') + # self.shape = self.name[:self.name.rfind(']')+1] + # self.name = self.name[self.name.rfind(']')+1:] + self.module_type = self.name[:self.name.find(': ')] + self.name = self.name[self.name.find(': ')+2:] + + +def create_event(event, is_pytorch_lightning) -> Optional[BaseEvent]: + try: + type = event.get('ph') + if type == 'X': + return create_trace_event(event, is_pytorch_lightning) + elif type == 'i' and event.get('name') == '[memory]': + return MemoryEvent(EventTypes.MEMORY, event) + else: + return None + except Exception as ex: + logger.warning('Failed to parse profile event. Exception=%s. 
Event=%s', ex, event, exc_info=True) + raise + + +def create_trace_event(event, is_pytorch_lightning) -> Optional[BaseEvent]: + category = event.get('cat') + event_type = EventTypeMap.get(category) + if event_type == EventTypes.OPERATOR: + name = event.get('name') + if name and name.startswith('ProfilerStep#'): + return ProfilerStepEvent(event) + if is_pytorch_lightning: + if name and name.startswith('[pl][profile]'): + return PLProfileEvent(event) + elif name and name.startswith('[pl][module]'): + return PLModuleEvent(event) + return OperatorEvent(event_type, event) + elif event_type == EventTypes.PYTHON: + return OperatorEvent(event_type, event) + elif event_type == EventTypes.KERNEL: + return KernelEvent(event_type, event) + elif event_type == EventTypes.PYTHON_FUNCTION: + if is_pytorch_lightning: + return None + args = event.get('args') + if args and args.get('Python module id') is not None: + return ModuleEvent(event) + else: + return PythonFunctionEvent(event_type, event) + elif event_type is not None: + return DurationEvent(event_type, event) + else: + return None + + +def create_association_events(events) -> Dict[int, int]: + forward_map = {} + backward_map = {} + + result = {} + for e in events: + ph = e.get('ph') + id = e['id'] + ts = e['ts'] + if ph == 's': + forward_map[id] = ts + elif ph == 'f': + backward_map[id] = ts + + for id, ts in forward_map.items(): + backward_ts = backward_map.get(id) + if backward_ts is not None: + result[ts] = backward_ts + + return result diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py new file mode 100644 index 0000000000000000000000000000000000000000..e54109eeb0628ff4de8313e3b29a1a76c9986c97 --- /dev/null +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py @@ -0,0 +1,485 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# -------------------------------------------------------------------------- +from collections import defaultdict +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +from . import consts, utils +from .profiler.diffrun import compare_op_tree, diff_summary +from .profiler.memory_parser import MemoryMetrics, MemoryRecord, MemorySnapshot +from .profiler.module_op import Stats +from .profiler.node import OperatorNode +from .utils import Canonicalizer, DisplayRounder + +logger = utils.get_logger() + + +class Run(object): + """ A profiler run. For visualization purpose only. + May contain profiling results from multiple workers. E.g. distributed scenario. 
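+    Profiles are keyed by the (worker, span) pair.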
+ """ + + def __init__(self, name, run_dir): + self.name = name + self.run_dir = run_dir + self.profiles: Dict[Tuple[str, str], RunProfile] = {} + + @property + def workers(self): + # get full worker list and remove the duplicated + worker_list, _ = zip(*self.profiles.keys()) + worker_list = sorted(list(dict.fromkeys(worker_list))) + return worker_list + + @property + def views(self) -> List[consts.View]: + view_set = set() + for profile in self.profiles.values(): + view_set.update(profile.views) + return sorted(list(view_set), key=lambda x: x.id) + + def get_workers(self, view): + worker_set = set() + for profile in self.profiles.values(): + for v in profile.views: + if v.display_name == view: + worker_set.add(profile.worker) + break + return sorted(list(worker_set)) + + def get_spans(self, worker=None): + if worker is not None: + spans = [s for w, s in self.profiles.keys() if w == worker] + else: + spans = [s for _, s in self.profiles.keys()] + + spans = list(set(spans)) + if len(spans) == 1 and spans[0] is None: + return None + else: + return sorted(spans) + + def add_profile(self, profile: Union['DistributedRunProfile', 'RunProfile']): + span = profile.span + if span is None: + span = 'default' + else: + span = str(span) + self.profiles[(profile.worker, span)] = profile + + def get_profile(self, worker, span) -> Union['DistributedRunProfile', 'RunProfile']: + if worker is None: + raise ValueError('the worker parameter is mandatory') + + if len(self.profiles) == 0: + return None + + return self.profiles.get((worker, span), None) + + def get_profiles(self, *, worker=None, span=None) \ + -> Optional[Union[List['RunProfile'], List['DistributedRunProfile']]]: + # Note: we could not use if span to check it is None or not + # since the span 0 will be skipped at this case. + if worker is not None and span is not None: + return self.profiles.get((worker, span), None) + elif worker is not None: + return [p for (w, s), p in self.profiles.items() if worker == w] + elif span is not None: + return [p for (w, s), p in self.profiles.items() if span == s] + else: + return self.profiles.values() + + +class RunProfile(object): + """ Cooked profiling result for a worker. For visualization purpose only. 
+ """ + + def __init__(self, worker, span): + self.worker = worker + self.span = span + self.views: List[consts.View] = [] + self.is_pytorch_lightning = False + self.has_runtime = False + self.has_kernel = False + self.has_communication = False + self.has_memcpy_or_memset = False + self.profiler_start_ts = float('inf') + self.overview = None + self.operation_pie_by_name = None + self.operation_table_by_name = None + self.operation_stack_by_name: Dict = None + self.operation_pie_by_name_input = None + self.operation_table_by_name_input = None + self.operation_stack_by_name_input: Dict = None + self.kernel_op_table = None + self.kernel_pie = None + self.kernel_table = None + self.tc_pie = None + self.trace_file_path: str = None + + self.gpu_metrics = None + + self.gpu_summary = None + self.gpu_tooltip = None + + # for memory stats and curve + self.memory_snapshot: Optional[MemorySnapshot] = None + self.tid2tree: Dict[int, OperatorNode] = None + self.pl_tid2tree: Dict[int, OperatorNode] = None + + self.module_stats: Optional[List(Stats)] = None + self.pl_module_stats: Optional[List(Stats)] = None + + def append_gpu_metrics(self, raw_data: bytes): + counter_json_str = ', {}'.format(', '.join(self.gpu_metrics)) + counter_json_bytes = bytes(counter_json_str, 'utf-8') + + raw_data_without_tail = raw_data[: raw_data.rfind(b']')] + raw_data = b''.join([raw_data_without_tail, counter_json_bytes, b']}']) + + import gzip + raw_data = gzip.compress(raw_data, 1) + return raw_data + + @staticmethod + def _filtered_by_ts(events: Iterable[MemoryRecord], start_ts, end_ts): + """Returns time-ordered events of memory allocation and free""" + if start_ts is not None and end_ts is not None: + events = [e for e in events if start_ts <= e.ts and e.ts <= end_ts] + elif start_ts is not None: + events = [e for e in events if start_ts <= e.ts] + elif end_ts is not None: + events = [e for e in events if e.ts <= end_ts] + + return events + + def get_memory_stats(self, start_ts=None, end_ts=None, memory_metric='K'): + cano = Canonicalizer(memory_metric=memory_metric) + round = DisplayRounder(ndigits=2) + + stats = self.memory_snapshot.get_memory_statistics(self.tid2tree, start_ts=start_ts, end_ts=end_ts) + + result = { + 'metadata': { + 'title': 'Memory View', + 'default_device': 'CPU', + 'search': 'Operator Name', + 'sort': f'Self Size Increase ({cano.memory_metric})' + }, + 'columns': [ + {'name': 'Operator Name', 'type': 'string'}, + {'name': 'Calls', 'type': 'number', 'tooltip': '# of calls of the operator.'}, + {'name': f'Size Increase ({cano.memory_metric})', 'type': 'number', + 'tooltip': 'The memory increase size include all children operators.'}, + {'name': f'Self Size Increase ({cano.memory_metric})', 'type': 'number', + 'tooltip': 'The memory increase size associated with the operator itself.'}, + {'name': 'Allocation Count', 'type': 'number', + 'tooltip': 'The allocation count including all chidren operators.'}, + {'name': 'Self Allocation Count', 'type': 'number', + 'tooltip': 'The allocation count belonging to the operator itself.'}, + {'name': f'Allocation Size ({cano.memory_metric})', 'type': 'number', + 'tooltip': 'The allocation size including all children operators.'}, + {'name': f'Self Allocation Size ({cano.memory_metric})', 'type': 'number', + 'tooltip': ('The allocation size belonging to the operator itself.\n' + 'It will sum up all allocation bytes without considering the memory free.')}, + ], + 'rows': {} + } + + for name in stats: + these_rows = [] + result['rows'][name] = these_rows + + memory 
+            for op_name, stat in sorted(memory.items()):
+                these_rows.append([
+                    op_name,
+                    stat[6],  # number of calls of the operator
+                    round(cano.convert_memory(stat[MemoryMetrics.IncreaseSize])),
+                    round(cano.convert_memory(stat[MemoryMetrics.SelfIncreaseSize])),
+                    stat[MemoryMetrics.AllocationCount],
+                    stat[MemoryMetrics.SelfAllocationCount],
+                    round(cano.convert_memory(stat[MemoryMetrics.AllocationSize])),
+                    round(cano.convert_memory(stat[MemoryMetrics.SelfAllocationSize])),
+                ])
+
+        for dev_name in sorted(stats.keys()):
+            if dev_name.startswith('GPU'):
+                result['metadata']['default_device'] = dev_name
+                break
+
+        return result
+
+    def get_memory_curve(
+            self,
+            time_metric: str = 'ms',
+            memory_metric: str = 'K',
+            patch_for_step_plot=True):
+        def get_curves_and_peaks(records: List[MemoryRecord], cano: Canonicalizer):
+            """Inputs:
+                records: Sorted list of MemoryRecord
+
+            For example:
+            ```py
+            {
+                'CPU': [  # Timestamp, Total Allocated, Total Reserved, Device Total Memory, operator
+                    [1, 4, 4, 1000000, 'aten::add'],
+                    [2, 16, 16, 1000000, 'aten::empty'],
+                    [4, 4, 16, 1000000, '...'],
+                ],
+                'GPU0': ...
+            }
+            ```"""
+            curves = defaultdict(list)
+            peaks = defaultdict(float)
+            for r in records:
+                if r.addr is None:
+                    continue
+                dev = r.device_name
+                ts = r.ts
+                ta = r.total_allocated
+                tr = r.total_reserved
+
+                if ta != ta or tr != tr:  # NaN check: NaN != NaN
+                    continue
+
+                curves[dev].append([
+                    cano.convert_time(ts - self.profiler_start_ts),
+                    cano.convert_memory(ta),
+                    cano.convert_memory(tr),
+                ])
+                peaks[dev] = max(peaks[dev], ta)
+
+            # iterate over a copy of the keys: entries are deleted while iterating
+            for dev in list(curves.keys()):
+                if len(curves[dev]) == 0:
+                    del curves[dev]
+                    del peaks[dev]
+
+            return curves, peaks
+
+        # NOTE: this patching should arguably happen in the frontend
+        def patch_curves_for_step_plot(curves: Dict[str, List]):
+            # For example, if a curve is [(0, 0), (1, 1), (2, 2)], the line plot
+            # is a straight line. Interpolating it as [(0, 0), (1, 0), (1, 1),
+            # (2, 1), (2, 2)] makes the line plot behave as a step plot.
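+            # A concrete sketch with hypothetical rows of the
+            # [time, allocated, reserved] form built above:
+            #   {'CPU': [[0, 0, 0], [5, 4, 8]]}
+            # is patched to
+            #   {'CPU': [[0, 0, 0], [5, 0, 0], [5, 4, 8]]}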
+            new_curves = defaultdict(list)
+            for dev, curve in curves.items():
+                new_curve = []
+                for i, p in enumerate(curve):
+                    if i != 0:
+                        # repeat the previous memory values at the new timestamp
+                        new_curve.append(p[:1] + new_curve[-1][1:])
+                    new_curve.append(p)
+                new_curves[dev] = new_curve
+            return new_curves
+
+        cano = Canonicalizer(time_metric, memory_metric)
+
+        curves, peaks = get_curves_and_peaks(self.memory_snapshot.memory_records, cano)
+        if patch_for_step_plot:
+            curves = patch_curves_for_step_plot(curves)
+        peaks_formatted = {}
+        totals = {}
+        for dev, value in peaks.items():
+            peaks_formatted[dev] = 'Peak Memory Usage: {:.1f}{}'.format(cano.convert_memory(value), cano.memory_metric)
+            if dev != 'CPU':
+                try:
+                    totals[dev] = cano.convert_memory(self.gpu_infos[int(dev[3:])]['Memory Raw'])
+                except Exception:
+                    # gpu_infos may be absent or lack this device; omit its total
+                    pass
+
+        devices: List[str] = sorted(list(curves.keys()))
+        default_device = 'CPU'
+        for dev in devices:
+            if dev.startswith('GPU'):
+                default_device = dev
+                break
+
+        return {
+            'metadata': {
+                'default_device': default_device,
+                'devices': devices,
+                'peaks': peaks_formatted,
+                'totals': totals,
+                'first_ts': self.profiler_start_ts,
+                'time_metric': cano.time_metric,
+                'memory_metric': cano.memory_metric,
+                'time_factor': cano.time_factor,
+                'memory_factor': cano.memory_factor,
+            },
+            'columns': [
+                {'name': f'Time ({cano.time_metric})', 'type': 'number', 'tooltip': 'Time since the profiler started.'},
+                {'name': f'Allocated ({cano.memory_metric})', 'type': 'number', 'tooltip': 'Total memory in use.'},
+                {'name': f'Reserved ({cano.memory_metric})', 'type': 'number',
+                 'tooltip': 'Total memory reserved by the allocator, both used and unused.'},
+            ],
+            'rows': curves,
+        }
+
+    def get_memory_events(
+            self,
+            start_ts=None,
+            end_ts=None,
+            time_metric: str = 'ms',
+            memory_metric: str = 'K'):
+        def get_op_name_or_ctx(record: MemoryRecord):
+            name = record.op_name_or_unknown
+            if name.startswith('aten::empty') and record.parent_op_name:
+                # aten::empty can be treated as the 'malloc' in pytorch
+                name = f'{record.parent_op_name} ({name})'
+            return name
+
+        cano = Canonicalizer(time_metric=time_metric, memory_metric=memory_metric)
+        round = DisplayRounder(ndigits=2)
+
+        profiler_start_ts = self.profiler_start_ts
+        memory_records = RunProfile._filtered_by_ts(self.memory_snapshot.memory_records, start_ts, end_ts)
+
+        events = defaultdict(list)
+        alloc = {}  # allocation events that may or may not have a paired free event
+        free = {}  # free events that do not have a paired alloc event
+        prev_ts = float('-inf')  # ensure memory records are ordered by timestamp
+        for i, r in enumerate(memory_records):
+            if r.addr is None:
+                # profile json data prior to pytorch 1.10 do not have addr;
+                # ignore those records
+                continue
+            assert prev_ts <= r.ts
+            prev_ts = r.ts
+            addr = r.addr
+            size = r.bytes
+            if r.is_allocation:
+                # to be matched with a release event
+                alloc[addr] = i
+            else:
+                if addr in alloc:
+                    alloc_r = memory_records[alloc[addr]]
+                    alloc_ts = alloc_r.ts
+                    free_ts = r.ts
+                    events[alloc_r.device_name].append([
+                        get_op_name_or_ctx(alloc_r),
+                        round(cano.convert_memory(-size)),
+                        round(cano.convert_time(alloc_ts - profiler_start_ts)),
+                        round(cano.convert_time(free_ts - profiler_start_ts)),
+                        round(cano.convert_time(free_ts - alloc_ts)),
+                    ])
+                    del alloc[addr]
+                else:
+                    if addr in free:
+                        logger.warning(f'Address {addr} is freed multiple times')
+                    free[addr] = i
+
+        for i in alloc.values():
+            r = memory_records[i]
+            events[r.device_name].append([
+                get_op_name_or_ctx(r),
+                round(cano.convert_memory(r.bytes)),
+                round(cano.convert_time(r.ts - profiler_start_ts)),
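+                # no paired free within the profiled window:
+                # release time and duration are unknown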
+                None,
+                None,
+            ])
+
+        for i in free.values():
+            r = memory_records[i]
+            events[r.device_name].append([
+                get_op_name_or_ctx(r),
+                round(cano.convert_memory(-r.bytes)),
+                None,
+                round(cano.convert_time(r.ts - profiler_start_ts)),
+                None,
+            ])
+
+        default_device = 'CPU'
+        for dev_name in sorted(events.keys()):
+            if dev_name.startswith('GPU'):
+                default_device = dev_name
+                break
+
+        return {
+            'metadata': {
+                'title': 'Memory Events',
+                'default_device': default_device,
+            },
+            'columns': [
+                {'name': 'Operator', 'type': 'string', 'tooltip': ''},
+                {'name': f'Size ({cano.memory_metric})', 'type': 'number', 'tooltip': ''},
+                {'name': f'Allocation Time ({cano.time_metric})', 'type': 'number', 'tooltip': ''},
+                {'name': f'Release Time ({cano.time_metric})', 'type': 'number', 'tooltip': ''},
+                {'name': f'Duration ({cano.time_metric})', 'type': 'number', 'tooltip': ''},
+            ],
+            'rows': events,  # in the form of { 'CPU': [...], 'GPU0': [...], ... }
+        }
+
+    def get_module_view(self):
+        if self.is_pytorch_lightning and self.pl_module_stats:
+            module_stats = self.pl_module_stats
+        elif self.module_stats:
+            module_stats = self.module_stats
+        else:
+            return None
+
+        result = {
+            'columns': [
+                {'name': 'Module Name', 'type': 'string', 'key': 'name'},
+                # note: the key spelling must match the Stats field name
+                {'name': 'Occurrences', 'type': 'number', 'key': 'occurences'},
+                {'name': 'Operators', 'type': 'number', 'key': 'operators'},
+                {'name': 'Host Total Time', 'type': 'number', 'key': 'host_duration'},
+                {'name': 'Host Self Time', 'type': 'number', 'key': 'self_host_duration'},
+                {'name': 'Device Total Time', 'type': 'number', 'key': 'device_duration'},
+                {'name': 'Device Self Time', 'type': 'number', 'key': 'self_device_duration'}
+            ],
+            'data': []
+        }
+
+        # convert each Stats namedtuple into a dict and recurse into its children
+        def process_modules_stats(parent: List[Any], modules_stats: List[Stats]):
+            for stats in modules_stats:
+                d = stats._asdict()
+                d['children'] = []
+                parent.append(d)
+                process_modules_stats(d['children'], stats.children)
+
+        process_modules_stats(result['data'], module_stats)
+        return result
+
+    def get_operator_tree(self):
+        if self.is_pytorch_lightning:
+            root = next(iter(self.pl_tid2tree.values()))
+        else:
+            root = next(iter(self.tid2tree.values()))
+
+        result = []
+
+        def traverse_node(parent: List, node: OperatorNode):
+            d = {
+                'name': node.name,
+                'start_time': node.start_time,
+                'end_time': node.end_time,
+                'type': node.type,
+                'tid': node.tid,
+                'children': []
+            }
+            parent.append(d)
+            for child in node.children:
+                traverse_node(d['children'], child)
+        traverse_node(result, root)
+        return result[0]
+
+    def compare_run(self, exp: 'RunProfile'):
+        base_root = next(iter(self.tid2tree.values()))
+        exp_root = next(iter(exp.tid2tree.values()))
+        diff_root = compare_op_tree(base_root, exp_root)
+        diff_stats = diff_summary(diff_root)
+        return diff_stats
+
+
+class DistributedRunProfile(object):
+    """Profiling of all workers in a view."""
+
+    def __init__(self, span: str):
+        self.worker = 'All'
+        self.span = span
+        self.views = []
+        self.gpu_info = None
+        self.steps_to_overlap = None
+        self.steps_to_wait = None
+        self.comm_ops = None
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/index.html b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..66f6aca5f320396a436b341c587b14be6ef2612c
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/index.html
@@ -0,0 +1,2 @@
+<!-- [HTML markup not captured in this extract] -->
\ No newline at end of file
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/index.js b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/index.js
new file mode 100644
index 0000000000000000000000000000000000000000..ddb69f41d2f149a7847fd267e4cdb8a576334cc4
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/index.js
@@ -0,0 +1,3 @@
+export async function render() {
+  document.location.href = 'index.html';
+}
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/trace_embedding.html b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/trace_embedding.html
new file mode 100644
index 0000000000000000000000000000000000000000..7b5c2bb65631ed59a148d9b1e5ee98e2b7a0af96
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/trace_embedding.html
@@ -0,0 +1,104 @@
+<!-- [104 lines of HTML markup not captured in this extract] -->
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/trace_viewer_full.html b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/trace_viewer_full.html
new file mode 100644
index 0000000000000000000000000000000000000000..15169a4572b546a7ba0f35e870bae528de913773
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/static/trace_viewer_full.html
@@ -0,0 +1,10174 @@
+<!-- [10174 lines of vendored trace-viewer HTML not captured in this extract] -->
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c40ce9befc30b1f894a12e7554b1ee4f325ae605
--- /dev/null
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py
@@ -0,0 +1,122 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# --------------------------------------------------------------------------
+import logging
+import math
+import os
+import time
+from contextlib import contextmanager
+from math import pow
+
+from . import consts
+
+
+def get_logging_level():
+    log_level = os.environ.get('TORCH_PROFILER_LOG_LEVEL', 'INFO').upper()
+    if log_level not in logging._levelToName.values():
+        log_level = logging.getLevelName(logging.INFO)
+    return log_level
+
+
+logger = None
+
+
+def get_logger():
+    global logger
+    if logger is None:
+        logger = logging.getLogger(consts.PLUGIN_NAME)
+        logger.setLevel(get_logging_level())
+    return logger
+
+
+def is_chrome_trace_file(path):
+    return consts.WORKER_PATTERN.match(path)
+
+
+def href(text, url):
+    """Return an HTML-formatted hyperlink string.
+
+    Note:
+        target="_blank" causes this link to be opened in a new tab if clicked.
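+
+    Example:
+        >>> href('PyTorch', 'https://pytorch.org')
+        '<a href="https://pytorch.org" target="_blank">PyTorch</a>'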
+ """ + return f'{text}' + + +class Canonicalizer: + def __init__( + self, + time_metric='us', + memory_metric='B', + *, + input_time_metric='us', + input_memory_metric='B'): + # raw timestamp is in microsecond + # https://github.com/pytorch/pytorch/blob/v1.9.0/torch/csrc/autograd/profiler_kineto.cpp#L33 + time_metric_to_factor = { + 'us': 1, + 'ms': 1e3, + 's': 1e6, + } + # raw memory is in bytes + memory_metric_to_factor = { + 'B': pow(1024, 0), + 'KB': pow(1024, 1), + 'MB': pow(1024, 2), + 'GB': pow(1024, 3), + } + + # canonicalize the memory metric to a string + self.canonical_time_metrics = { + 'micro': 'us', 'microsecond': 'us', 'us': 'us', + 'milli': 'ms', 'millisecond': 'ms', 'ms': 'ms', + '': 's', 'second': 's', 's': 's', + } + # canonicalize the memory metric to a string + self.canonical_memory_metrics = { + '': 'B', 'B': 'B', + 'K': 'KB', 'KB': 'KB', + 'M': 'MB', 'MB': 'MB', + 'G': 'GB', 'GB': 'GB', + } + + self.time_metric = self.canonical_time_metrics[time_metric] + self.memory_metric = self.canonical_memory_metrics[memory_metric] + + # scale factor scale input to output + self.time_factor = time_metric_to_factor[self.canonical_time_metrics[input_time_metric]] /\ + time_metric_to_factor[self.time_metric] + self.memory_factor = memory_metric_to_factor[self.canonical_memory_metrics[input_memory_metric]] /\ + memory_metric_to_factor[self.memory_metric] + + def convert_time(self, t): + return self.time_factor * t + + def convert_memory(self, m): + return self.memory_factor * m + + +class DisplayRounder: + """Round a value for display purpose.""" + + def __init__(self, ndigits): + self.ndigits = ndigits + self.precision = pow(10, -ndigits) + + def __call__(self, v: float): + _v = abs(v) + if _v >= self.precision or v == 0: + return round(v, 2) + else: + ndigit = abs(math.floor(math.log10(_v))) + return round(v, ndigit) + + +@contextmanager +def timing(description: str, force: bool = False) -> None: + if force or os.environ.get('TORCH_PROFILER_BENCHMARK', '0') == '1': + start = time.time() + yield + elapsed_time = time.time() - start + logger.info(f'{description}: {elapsed_time}') + else: + yield