diff --git a/.python-version b/.python-version new file mode 100644 index 0000000000000000000000000000000000000000..c8cfe3959183f8e9a50f83f54cd723f2dc9c252d --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10 diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000000000000000000000000000000000000..2011009c47dba8dd904b230e7729f20613ac579d --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,289 @@ +pipeline { + agent { + kubernetes { + cloud 'ci-kubernets' + inheritFrom 'ci-mcTileLang' + yaml """ +apiVersion: v1 +kind: Pod +metadata: + namespace: ci +spec: + containers: + - name: jnlp + image: harbor-jiajia.mxcr.io/github-runner/maca-pytorch-jenkins-runner:3.5.3.6-torch2.8-py312-ubuntu24.04-amd64 + resources: + requests: + memory: "64Gi" + cpu: 24 + metax-tech.com/gpu: 1 + limits: + memory: "64Gi" + cpu: 24 + metax-tech.com/gpu: 1 +""" + } + } + + options { + giteeConnection('mcTileLang-MetaXGPU') + } + + environment { + GITEE_OWNER = "MetaX-MACA" + GITEE_REPO = "mcTileLang" + GITEE_CHECK_NAME = "Jenkins-metax" + + CLANG_TIDY_CMAKE_OPTIONS = "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON" + PYTHONDEVMODE = "1" + PYTHONUNBUFFERED = "1" + PYTHONPATH = "" + PIP_USER = "" + COLUMNS = "100" + FORCE_COLOR = "1" + CLICOLOR_FORCE = "1" + UV_INDEX_STRATEGY = "unsafe-best-match" + UV_HTTP_TIMEOUT = "600" + XDG_CACHE_HOME = "${WORKSPACE}/.cache" + PIP_CACHE_DIR = "${WORKSPACE}/.cache/pip" + UV_CACHE_DIR = "${WORKSPACE}/.cache/uv" + PRE_COMMIT_HOME = "${WORKSPACE}/.cache/pip/.pre-commit" + + LINT_PYTHON_VERSION = "3.10" + PYTEST_PYTHON_VERSIONS = "3.12" + UV_INDEX = "https://mirrors.aliyun.com/pypi/simple" + TOKEN = credentials('984163ee-7976-44fa-af18-d830f664f449') + } + + stages { + stage('Test Pipeline Status Callback') { + steps { + sh ''' + curl -X POST \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + https://gitee.com/api/v5/repos/owner/repo/check-runs \ + -d '{ + "name":"Jenkins检查", + "head_sha":"'$GIT_COMMIT'", + "status":"in_progress" + }' + ''' + } + } + stage('Prepare UV Environments') { + steps { + script { + sh """ + export PATH="\${HOME}/.local/bin:\$PATH" + if ! command -v uv &>/dev/null; then + wget -qO- https://uv.agentsmirror.com/install-cn.sh | sh + fi + uv venv .venv_lint --python ${env.LINT_PYTHON_VERSION} --allow-existing + uv venv .venv_test --python ${env.PYTEST_PYTHON_VERSIONS} --allow-existing + """ + } + } + } + stage('Quick Lint') { + environment { + VIRTUAL_ENV = "${WORKSPACE}/.venv_lint" + PATH = "${WORKSPACE}/.venv_lint/bin:${HOME}/.local/bin:${env.PATH}" + } + stages { + stage('Setup Python') { + steps { + sh ''' + uv pip install -r requirements-lint.txt + uv pip install pipx + ''' + } + } + stage('Check AST with Python') { + steps { + sh ''' + python -m compileall -q -f tilelang + ''' + } + } + stage('Pre-commit Lint') { + steps { + sh ''' + if ! pipx run pre-commit run --all-files --color=always --show-diff-on-failure; then + echo "Pre-commit checks failed. Please run 'pre-commit run --all-files' locally to see the issues." + exit 1 + fi + ''' + } + } + } + post { + failure { + sh ''' + echo "Clearing uv cache at ${UV_CACHE_DIR} due to failure." + uv cache clean + ''' + } + } + } + stage("Test for Python 3.12 with Metax") { + environment { + VIRTUAL_ENV = "${WORKSPACE}/.venv_test" + PATH = "${WORKSPACE}/.venv_test/bin:${HOME}/.local/bin:${env.PATH}" + } + stages { + stage('Set environment (self-hosted runners)') { + steps { + echo "Self-hosted runner, skipping cache reconfiguration" + } + } + stage("Set environment Metax") { + steps { + script { + // Metax/MACA 环境变量配置 + env.TILELANG_HOME = "${WORKSPACE}" + env.TVM_ROOT = "${WORKSPACE}/3rdparty/tvm" + env.TVM_IMPORT_PYTHON_PATH = "${env.TVM_ROOT}/python" + env.PYTHONPATH = "${env.TILELANG_HOME}:${env.TVM_IMPORT_PYTHON_PATH}:${env.PYTHONPATH ?: ''}" + env.USE_MACA = "ON" + env.LD_LIBRARY_PATH = "/opt/maca/ompi/lib:/opt/maca/mxgpu_llvm/lib:/opt/maca/lib${env.LD_LIBRARY_PATH ? ':' + env.LD_LIBRARY_PATH : ''}" + } + } + } + stage('Setup Python and uv with caching') { + steps { + script { + env.UV_CACHE_DIR = "${WORKSPACE}/.cache/uv" + } + } + } + stage('Setup venv') { + steps { + sh ''' + pip3 config set global.index https://mirrors.aliyun.com/pypi + pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple + pip3 config set install.trusted-host mirrors.aliyun.com + + uv pip install --upgrade pip setuptools wheel + if [[ -n "${UV_INDEX:-}" ]]; then + uv pip install --prerelease=allow -v torch --index-url "${UV_INDEX}" || \ + uv pip install -v torch + else + uv pip install -v torch + fi + echo "import torch; print(f'torch: {torch.__version__}')" | uv run --no-project --script - + uv pip install -v -r requirements-test.txt + if [[ -f requirements-test-metax.txt ]]; then + uv pip install -v -r requirements-test-metax.txt + fi + echo "::group::torch.utils.collect_env" + uv run --no-project -m -- torch.utils.collect_env || true + echo "::endgroup::" + ''' + } + } + stage('Install project (wheel form)') { + steps { + sh ''' + uv pip install -v . + ''' + } + } + stage('Run clang-tidy') { + steps { + sh ''' + echo "\$ $(command -v clang-tidy) --version" && clang-tidy --version + + # Download run-clang-tidy script + RCT_URL=https://raw.githubusercontent.com/llvm/llvm-project/refs/heads/release/21.x/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py + echo "Downloading run-clang-tidy script from ${RCT_URL}" + echo "import urllib.request; url = '${RCT_URL}'.rstrip('/'); urllib.request.urlretrieve(url, url.split('/')[-1])" | uv run --no-project --script - + RUN_CLANG_TIDY=(uv run --no-project --script -- run-clang-tidy.py) + + if [[ -x "$(command -v clang-apply-replacements)" ]]; then + echo "Using clang-apply-replacements from $(command -v clang-apply-replacements)" + RUN_CLANG_TIDY+=(-fix -clang-apply-replacements-binary="$(command -v clang-apply-replacements)") + else + echo "::warning::clang-apply-replacements not found in PATH, automatic fixing disabled." + fi + + # Run cmake to create the build directory with compile_commands.json + cmake -S . -B cmake-build --fresh ${CLANG_TIDY_CMAKE_OPTIONS} # no quotes here + echo "::group::compile_commands.json" + ls -alh cmake-build/compile_commands.json + uv run --no-project -m -- json.tool --no-ensure-ascii cmake-build/compile_commands.json + echo "::endgroup::" + + CXX_FILES=$(find src -type f -iname "*.[ch]pp" -o -iname "*.cc" -o -iname "*.c" -o -iname "*.h") + rc=0 + echo "::group::run-clang-tidy" + "${RUN_CLANG_TIDY[@]}" -clang-tidy-binary="$(command -v clang-tidy)" \ + -exclude-header-filter='^(3rdparty|tvm)/.*$' \ + -p="cmake-build" ${CXX_FILES} || rc="$?" + echo "::endgroup::" + rm -rf cmake-build run-clang-tidy.py + if (( rc != 0 )); then + echo "::error::clang-tidy found issues (exit code: ${rc}). Please run 'clang-tidy --fix' locally to fix them." + git diff --color=always || true + exit "${rc}" + fi + ''' + } + } + stage('Run Metax tests with Python') { + steps { + sh ''' + #!/bin/bash + cd testing + PYTEST=( + uv run --no-project -m -- + pytest \ + --verbose \ + --color=yes --durations=0 --showlocals \ + --cache-clear \ + --junitxml=results.xml \ + ) + "${PYTEST[@]}" --maxfail=3 --numprocesses=4 \ + ./python + ''' + } + } + stage('List generated files') { + steps { + sh ''' + find . -type f -name '*.py[co]' -delete + find . -depth -type d -name "__pycache__" -exec rm -r "{}" + + if git status --ignored --porcelain | grep -qvE '/$'; then + ls -alh $(git status --ignored --porcelain | grep -vE '/$' | grep -oE '\\S+$') + fi + ''' + } + } + } + post { + always { + junit 'testing/results.xml' + } + failure { + sh ''' + uv cache clean + ''' + } + } + } + } + post { + success { + // updateGiteePushStatus(state: 'SUCCESS') + sh ''' + echo "Build and tests succeeded." + ''' + } + failure { + step([$class: 'GiteeConnectionStatusSetter', status: 'FAILURE']) + // sh ''' + // echo "Build or tests failed." + // ''' + } + } +} diff --git a/THIRDPARTYNOTICES.txt b/THIRDPARTYNOTICES.txt index 4b352b49bfa7d75e9bd99cc17c683d013a7746d7..6b1b6ef45af99f3ff1d917f2bccce6492aec2216 100644 --- a/THIRDPARTYNOTICES.txt +++ b/THIRDPARTYNOTICES.txt @@ -58,8 +58,8 @@ The following files are newly added by MetaX Integrated Circuits (Shanghai) Co., added: tilelang/carver/arch/maca.py added: tilelang/quantize/lop3_maca.py -BitBLAS uses third-party material as listed below. The attached notices are -provided for informational purposes only. +BitBLAS uses third-party material as listed below. The attached notices are +provided for informational purposes only. ======= BitBLAS uses third-party material as listed below. The attached notices are provided for informational purposes only. diff --git a/ci/build_and_test.sh b/ci/build_and_test.sh new file mode 100755 index 0000000000000000000000000000000000000000..ae6ae81a86457019979cb8b2768933cb78970762 --- /dev/null +++ b/ci/build_and_test.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -e +export TILELANG_HOME=$(pwd) +export TVM_ROOT=${TILELANG_HOME}/3rdparty/tvm +export PYTHONPATH=${TILELANG_HOME}:${TVM_ROOT}/python:${PYTHONPATH} +export USE_MACA=ON +export LD_LIBRARY_PATH=/opt/maca/ompi/lib:/opt/maca/mxgpu_llvm/lib:/opt/maca/lib:$LD_LIBRARY_PATH + +# 容器内环境初始化 +sed -i "s@http.*com/ubuntu@http://repo.metax-tech.com/r/ubuntu@g" /etc/apt/sources.list +apt-get update && apt-get install -y git cmake + +pip3 config set global.index-url https://repo.metax-tech.com/r/pypi/simple +pip3 config set install.trusted-host repo.metax-tech.com +pip install pytest-csv cython z3-solver==4.13.0 psutil cloudpickle torch-c-dlpack-ext + +# 编译 +git config --global --add safe.directory ${TILELANG_HOME} +mkdir -p build +cmake -B build -DUSE_MACA=ON +make -C build -j$(nproc) + +# 安装 ffi 并运行测试 +cd 3rdparty/tvm/3rdparty/tvm-ffi && pip install . +cd ${TILELANG_HOME} +python3 examples/quickstart.py +pytest --junitxml=report.xml testing/python/analysis/test_tilelang_nested_loop_checker.py diff --git a/docs/get_started/Installation_maca.md b/docs/get_started/Installation_maca.md index ce742b1742a45282ee0a2e8181bf6108ac368886..3818c722d4226f0252323e407f4c953d86fd93ee 100644 --- a/docs/get_started/Installation_maca.md +++ b/docs/get_started/Installation_maca.md @@ -70,7 +70,7 @@ Check out [Installing Docker Engine](https://docs.docker.com/engine/install/ubun Check out [Metax docker](https://sw-download.metax-tech.com/docker) and download pytorch image. ``` bash -docker login --username=cr_temp_user --password=eyJpbnN0YW5jZUlkIjoiY3JpLXpxYTIzejI2YTU5M3R3M2QiLCJ0aW1lIjoiMTc3MDg5NTI0MzAwMCIsInR5cGUiOiJzdWIiLCJ1c2VySWQiOiIyMDcwOTQwMTA1NjYzNDE3OTIifQ:91ecedb8bd5c4af6858745f0329d069263e1bf82 cr.metax-tech.com && docker pull cr.metax-tech.com/public-library/maca-pytorch:3.3.0.4-torch2.6-py310-ubuntu24.04-amd64 +docker login --username=cr_temp_user --password=eyJpbnN0YW5jZUlkIjoiY3JpLXpxYTIzejI2YTU5M3R3M2QiLCJ0aW1lIjoiMTc3MDg5NTI0MzAwMCIsInR5cGUiOiJzdWIiLCJ1c2VySWQiOiIyMDcwOTQwMTA1NjYzNDE3OTIifQ:91ecedb8bd5c4af6858745f0329d069263e1bf82 cr.metax-tech.com && docker pull cr.metax-tech.com/public-library/maca-pytorch:3.3.0.4-torch2.6-py310-ubuntu24.04-amd64 docker run -it --net=host --device=/dev/dri --device=/dev/mxcd --group-add video --name mctilelang cr.metax-tech.com/public-library/maca-pytorch:3.3.0.4-torch2.6-py310-ubuntu24.04-amd64 /bin/bash @@ -124,4 +124,4 @@ export PYTHONPATH=/path/to/mcTileLang:$PYTHONPATH python -c "import tilelang; print(tilelang.__version__)" python path/to/mcTileLang/examples/quickstart.py -``` \ No newline at end of file +``` diff --git a/examples/deepseek_mla/test_example_mla_decode.py b/examples/deepseek_mla/test_example_mla_decode.py index 887f142ca52448db04ea992f8c817e4de098a278..c9b7cb7ff9ec7da5955f398c8da282718bcb6155 100644 --- a/examples/deepseek_mla/test_example_mla_decode.py +++ b/examples/deepseek_mla/test_example_mla_decode.py @@ -3,6 +3,7 @@ import tilelang.testing import example_mla_decode + def test_example_mla_decode(): example_mla_decode.main() diff --git a/examples/gemm/test_example_gemm.py b/examples/gemm/test_example_gemm.py index 88873be0381d847201a03b3bd67891b769e4b73d..edeb3c81c2da4a5cba2517ad9b2c9ba443ae1876 100644 --- a/examples/gemm/test_example_gemm.py +++ b/examples/gemm/test_example_gemm.py @@ -1,5 +1,6 @@ # 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +import pytest import tilelang.testing import example_gemm_autotune import example_gemm_intrinsics diff --git a/requirements-test-metax.txt b/requirements-test-metax.txt new file mode 100644 index 0000000000000000000000000000000000000000..f379d9eeb3ae971167193fe205e0a3d96f64675d --- /dev/null +++ b/requirements-test-metax.txt @@ -0,0 +1,5 @@ +# Lint requirements +--requirement requirements-lint.txt + +# Common test requirements +--requirement requirements-test.txt diff --git a/src/layout/gemm_layouts.cc b/src/layout/gemm_layouts.cc index 0193479c5c08cd33223ed5af7fb40d56a83af2a3..4164b638e5e035c02f86fe457a229befb53d66cf 100644 --- a/src/layout/gemm_layouts.cc +++ b/src/layout/gemm_layouts.cc @@ -1,4 +1,5 @@ -// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights +// Reserved. /*! * \file layout/gemm_layouts.cc * \brief Define Layout used in MMA and other operations. @@ -349,7 +350,8 @@ Fragment makeGemmFragmentAMACA(const int block_m, const int block_n, if (transposed) { PrimExpr forward_thread = 16 * FloorDiv(i->var, 4) + j; PrimExpr index = FloorMod(i->var, 4); - auto base_layout = Fragment({i, j}, {index}, forward_thread, rep)->Repeat({1, 1}, false, false); + auto base_layout = Fragment({i, j}, {index}, forward_thread, rep) + ->Repeat({1, 1}, false, false); auto warp_layout = base_layout->Repeat({1, block_m / warp_m}, true, false) ->Replicate(block_n / warp_n); auto block_layout = @@ -358,7 +360,8 @@ Fragment makeGemmFragmentAMACA(const int block_m, const int block_n, } else { PrimExpr forward_thread = 16 * FloorDiv(j->var, 4) + i; PrimExpr index = FloorMod(j->var, 4); - auto base_layout = Fragment({i, j}, {index}, forward_thread, rep)->Repeat({1, 1}, false, false); + auto base_layout = Fragment({i, j}, {index}, forward_thread, rep) + ->Repeat({1, 1}, false, false); auto warp_layout = base_layout->Repeat({block_m / warp_m, 1}, true) ->Replicate(block_n / warp_n); auto block_layout = @@ -993,7 +996,8 @@ Layout makeGemmABLayoutMACA(int mat_stride, int mat_continuous, int continuity, return makeGemmABLayoutPadded(mat_stride, mat_continuous, element_size); } else if (mat_continuous % (vector_size * 8) == 0) { if (mat_stride % 64 == 32) { - return MakeFullBankSwizzleLayout2D(mat_stride, mat_continuous, element_size); + return MakeFullBankSwizzleLayout2D(mat_stride, mat_continuous, + element_size); } Var i = InputPlaceholder(0); Var j = InputPlaceholder(1); @@ -1007,7 +1011,8 @@ Layout makeGemmABLayoutMACA(int mat_stride, int mat_continuous, int continuity, PrimExpr index = vec + (c_swizzle + s * 16) * vector_size; return Layout(Array{mat_stride, mat_continuous}, {tc, ts, index}); } else if (mat_continuous % (vector_size * 4) == 0) { - return MakeHalfBankSwizzleLayout2D(mat_stride, mat_continuous, element_size); + return MakeHalfBankSwizzleLayout2D(mat_stride, mat_continuous, + element_size); } else { ICHECK(0); return makeGemmABLayoutPadded(mat_stride, mat_continuous, element_size); diff --git a/src/layout/layout.h b/src/layout/layout.h index 99b5edeeb239ee700f449eab65dc0d054026476d..fa0a9af1e65b69d7600220a5555834165143c53a 100644 --- a/src/layout/layout.h +++ b/src/layout/layout.h @@ -1,4 +1,5 @@ -// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights +// Reserved. /*! * \file Layout.h * diff --git a/src/op/gemm.cc b/src/op/gemm.cc index a488742124b557c56523c13c7b351562869a6a93..714213805e65ae0ba01b5ee811c315e4cb2819a6 100644 --- a/src/op/gemm.cc +++ b/src/op/gemm.cc @@ -1,4 +1,5 @@ -// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights +// Reserved. /*! * \file tl/op/gemm.cc * \brief Implementation of General Matrix Multiplication (GEMM) operators diff --git a/src/op/logical.cc b/src/op/logical.cc index e8ac23040e3b5cc4aa46baedafbbdf09b4fa47dd..09906d229b850b06a2b8b0b0256a8398d0c7cef4 100644 --- a/src/op/logical.cc +++ b/src/op/logical.cc @@ -1,4 +1,5 @@ -// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights +// Reserved. /*! * \file tl/op/logical.cc * \brief Logical operations. diff --git a/src/target/codegen_maca.cc b/src/target/codegen_maca.cc index 6e8ae676c5e1139777486d8966b4901f01725b3c..e3b00a0076c64a9e7012d890c2a581061f09e28d 100644 --- a/src/target/codegen_maca.cc +++ b/src/target/codegen_maca.cc @@ -1,4 +1,5 @@ -// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights reserved. +// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights +// reserved. /*! * \file target/codegen.cc @@ -715,8 +716,8 @@ void CodeGenTileLangMACA::PrintVecElemStore(const std::string &vec, DataType t, } } else if (t.is_bfloat16()) { if (t.lanes() <= 8) { - stream << "((maca_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" - << access[i % 2] << " = " << value << ";\n"; + stream << "((maca_bfloat162*)(&(" << vec << "." << access[i / 2] + << ")))->" << access[i % 2] << " = " << value << ";\n"; } else { stream << "(((maca_bfloat162*)(&(" << vec << "." << access[i / 4] << "))) + " << (i / 2 % 2) << ")->" << access[i % 2] << " = " @@ -937,8 +938,8 @@ void CodeGenTileLangMACA::VisitExpr_(const CastNode *op, std::ostream &os) { if (from_ty.is_bfloat16() && target_ty.is_float() && target_ty.bits() == 32) { // Use __bfloat1622float2 for vectorized conversion (bfloat162 -> float2) if (lanes == 2 || lanes == 4 || lanes == 8) { - PrintVectorizedCast("__bfloat1622float2", "__maca_bfloat162", "float2", "", - true, false); + PrintVectorizedCast("__bfloat1622float2", "__maca_bfloat162", "float2", + "", true, false); return; } } @@ -958,9 +959,11 @@ void CodeGenTileLangMACA::VisitExpr_(const CastNode *op, std::ostream &os) { tl::IsCudaVectorizableFP8(target_ty)) { bool target_type_is_e4m3 = target_ty.is_float8_e4m3() || target_ty.is_float8_e4m3fn(); - std::string type_suffix = target_type_is_e4m3 ? "__MACA_E4M3" : "__MACA_E5M2"; + std::string type_suffix = + target_type_is_e4m3 ? "__MACA_E4M3" : "__MACA_E5M2"; - // Use __maca_cvt_float2_to_fp8x2 for vectorized conversion (float2 -> fp8x2) + // Use __maca_cvt_float2_to_fp8x2 for vectorized conversion (float2 -> + // fp8x2) if (lanes == 2 || lanes == 4 || lanes == 8) { std::string extra_args = ", __MACA_SATFINITE, " + type_suffix; PrintVectorizedCast("__maca_cvt_float2_to_fp8x2", "float2", @@ -990,8 +993,8 @@ void CodeGenTileLangMACA::VisitExpr_(const CastNode *op, std::ostream &os) { // bfloat162) if (lanes == 2 || lanes == 4 || lanes == 8) { PrintVectorizedCast("__tl_cvt_e8m0x2_to_bfloat162", - "__maca_fp8x2_storage_t", "__maca_bfloat162", "", true, - false); + "__maca_fp8x2_storage_t", "__maca_bfloat162", "", + true, false); return; } } diff --git a/src/target/codegen_maca.h b/src/target/codegen_maca.h index 7a88a374b00e5053c3f3c685a64a9bea6cadee04..72ffeae4f911e249ec99363698b4677e2ad2cd4c 100644 --- a/src/target/codegen_maca.h +++ b/src/target/codegen_maca.h @@ -1,4 +1,5 @@ -// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights reserved. +// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights +// reserved. /*! * \file target/codegen.h diff --git a/src/target/intrin_rule_maca.cc b/src/target/intrin_rule_maca.cc index 37d578a2b6746dbe4337a901d75b54540abf65f3..ca27d11adb0d439efb5d0da933e1c71f711c9f1e 100644 --- a/src/target/intrin_rule_maca.cc +++ b/src/target/intrin_rule_maca.cc @@ -36,32 +36,32 @@ struct MACAMath { std::string operator()(DataType t, std::string name) const { if (t.is_float()) { switch (t.bits()) { - case 64: - return name; - case 32: - return name + 'f'; - case 16: { - if (name == "fabs") { - return "__habs"; - } else if (name == "round") { - return "hrint"; - } else { - return "h" + name; - } + case 64: + return name; + case 32: + return name + 'f'; + case 16: { + if (name == "fabs") { + return "__habs"; + } else if (name == "round") { + return "hrint"; + } else { + return "h" + name; } - default: - return ""; + } + default: + return ""; } } else if (t.is_bfloat16()) { return 'h' + name; } else if (t.is_int() || t.is_uint()) { switch (t.bits()) { - case 32: - return "__" + name; - case 64: - return "__" + name + "ll"; - default: - return ""; + case 32: + return "__" + name; + case 64: + return "__" + name + "ll"; + default: + return ""; } } return ""; @@ -83,16 +83,16 @@ struct MACAFastMathTan : public MACAMath { std::string operator()(DataType t, std::string name) const { if (t.is_float()) { switch (t.bits()) { - case 64: - return name; - // `__tanf` seems to produce some values too deviant from numpy tan version. - // So, let's use just `tanf` instead. - case 32: - return name + 'f'; - case 16: - return 'h' + name; - default: - return ""; + case 64: + return name; + // `__tanf` seems to produce some values too deviant from numpy tan + // version. So, let's use just `tanf` instead. + case 32: + return name + 'f'; + case 16: + return 'h' + name; + default: + return ""; } } return ""; @@ -103,12 +103,12 @@ struct MACAPopcount { std::string operator()(DataType t, std::string name) const { if (t.is_uint()) { switch (t.bits()) { - case 32: - return "__popc"; - case 64: - return "__popcll"; - default: - return ""; + case 32: + return "__popc"; + case 64: + return "__popcll"; + default: + return ""; } } return ""; @@ -116,7 +116,7 @@ struct MACAPopcount { }; struct MACAWarpIntrinsic { - const Op operator()(DataType t, const Op& orig_op) const { + const Op operator()(DataType t, const Op &orig_op) const { if (orig_op.same_as(builtin::tvm_warp_shuffle())) { return Op::Get("tir.maca.__shfl_sync"); } else if (orig_op.same_as(builtin::tvm_warp_shuffle_up())) { @@ -128,117 +128,142 @@ struct MACAWarpIntrinsic { } }; -static PrimExpr DispatchMACAWarpActiveMask(const PrimExpr& e) { - const CallNode* call = e.as(); +static PrimExpr DispatchMACAWarpActiveMask(const PrimExpr &e) { + const CallNode *call = e.as(); return Call(call->dtype, Op::Get("tir.maca.__activemask"), call->args); } -template -static PrimExpr DispatchMACAShuffle(const PrimExpr& e) { - const CallNode* call = e.as(); +template static PrimExpr DispatchMACAShuffle(const PrimExpr &e) { + const CallNode *call = e.as(); ICHECK(call != nullptr); - ICHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size - ffi::Array maca_args{{call->args[0], call->args[1], call->args[2], call->args[3]}}; + ICHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size + ffi::Array maca_args{ + {call->args[0], call->args[1], call->args[2], call->args[3]}}; return Call(call->dtype, T()(call->dtype, Downcast(call->op)), maca_args); } TVM_REGISTER_OP("tir.clz").set_attr( - "maca.FLowerIntrinsic", DispatchPureExtern); + "maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.floor") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.ceil") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.trunc") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.fabs") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.round") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.nearbyint") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); -TVM_REGISTER_OP("tir.exp").set_attr("maca.FLowerIntrinsic", - DispatchPureExtern); +TVM_REGISTER_OP("tir.exp").set_attr( + "maca.FLowerIntrinsic", DispatchPureExtern); TVM_REGISTER_OP("tir.exp2") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.exp10") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); -TVM_REGISTER_OP("tir.erf").set_attr("maca.FLowerIntrinsic", - DispatchPureExtern); +TVM_REGISTER_OP("tir.erf").set_attr( + "maca.FLowerIntrinsic", DispatchPureExtern); -TVM_REGISTER_OP("tir.log").set_attr("maca.FLowerIntrinsic", - DispatchPureExtern); +TVM_REGISTER_OP("tir.log").set_attr( + "maca.FLowerIntrinsic", DispatchPureExtern); TVM_REGISTER_OP("tir.log2") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.log10") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); -TVM_REGISTER_OP("tir.tan").set_attr("maca.FLowerIntrinsic", - DispatchPureExtern); +TVM_REGISTER_OP("tir.tan").set_attr( + "maca.FLowerIntrinsic", DispatchPureExtern); -TVM_REGISTER_OP("tir.cos").set_attr("maca.FLowerIntrinsic", - DispatchPureExtern); +TVM_REGISTER_OP("tir.cos").set_attr( + "maca.FLowerIntrinsic", DispatchPureExtern); TVM_REGISTER_OP("tir.cosh") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); -TVM_REGISTER_OP("tir.sin").set_attr("maca.FLowerIntrinsic", - DispatchPureExtern); +TVM_REGISTER_OP("tir.sin").set_attr( + "maca.FLowerIntrinsic", DispatchPureExtern); TVM_REGISTER_OP("tir.sinh") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.atan") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.tanh") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.sqrt") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); -TVM_REGISTER_OP("tir.pow").set_attr("maca.FLowerIntrinsic", - DispatchPureExtern); +TVM_REGISTER_OP("tir.pow").set_attr( + "maca.FLowerIntrinsic", DispatchPureExtern); TVM_REGISTER_OP("tir.popcount") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); TVM_REGISTER_OP("tir.tvm_warp_shuffle") - .set_attr("maca.FLowerIntrinsic", DispatchMACAShuffle); + .set_attr("maca.FLowerIntrinsic", + DispatchMACAShuffle); TVM_REGISTER_OP("tir.tvm_warp_shuffle_up") - .set_attr("maca.FLowerIntrinsic", DispatchMACAShuffle); + .set_attr("maca.FLowerIntrinsic", + DispatchMACAShuffle); TVM_REGISTER_OP("tir.tvm_warp_shuffle_down") - .set_attr("maca.FLowerIntrinsic", DispatchMACAShuffle); + .set_attr("maca.FLowerIntrinsic", + DispatchMACAShuffle); TVM_REGISTER_OP("tir.tvm_warp_activemask") - .set_attr("maca.FLowerIntrinsic", DispatchMACAWarpActiveMask); + .set_attr("maca.FLowerIntrinsic", + DispatchMACAWarpActiveMask); TVM_REGISTER_OP("tir.fmod") - .set_attr("maca.FLowerIntrinsic", DispatchPureExtern); + .set_attr("maca.FLowerIntrinsic", + DispatchPureExtern); // Register low-level builtin ops. -// TODO(tvm-team): consider make MACA its own subfolder and create a file for low-level builtins. +// TODO(tvm-team): consider make MACA its own subfolder and create a file for +// low-level builtins. TVM_REGISTER_OP("tir.maca.__shfl_sync") .set_num_inputs(4) .add_argument("mask", "Expr", "The thread mask.") .add_argument("var", "Expr", "The variable to sync.") .add_argument("lane", "Expr", "The source thread id.") - .add_argument("width", "Expr", "The warp thread width, must be a power of 2.") + .add_argument("width", "Expr", + "The warp thread width, must be a power of 2.") .set_attr("TGlobalSymbol", "__shfl_sync") - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)) .set_attr("maca.need_warp_shuffle", true); TVM_REGISTER_OP("tir.maca.__shfl_up_sync") @@ -246,27 +271,33 @@ TVM_REGISTER_OP("tir.maca.__shfl_up_sync") .add_argument("mask", "Expr", "The thread mask.") .add_argument("var", "Expr", "The variable to sync.") .add_argument("delta", "Expr", "The source lane id offset to be added.") - .add_argument("width", "Expr", "The warp thread width, must be a power of 2.") + .add_argument("width", "Expr", + "The warp thread width, must be a power of 2.") .set_attr("TGlobalSymbol", "__shfl_up_sync") - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)) .set_attr("maca.need_warp_shuffle", true); TVM_REGISTER_OP("tir.maca.__shfl_down_sync") .set_num_inputs(4) .add_argument("mask", "Expr", "The thread mask.") .add_argument("var", "Expr", "The variable to sync.") - .add_argument("delta", "Expr", "The source lane id offset to be subtracted.") - .add_argument("width", "Expr", "The warp thread width, must be a power of 2.") + .add_argument("delta", "Expr", + "The source lane id offset to be subtracted.") + .add_argument("width", "Expr", + "The warp thread width, must be a power of 2.") .set_attr("TGlobalSymbol", "__shfl_down_sync") - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)) .set_attr("maca.need_warp_shuffle", true); TVM_REGISTER_OP("tir.maca.__activemask") .set_num_inputs(0) .set_attr("TGlobalSymbol", "__activemask") - .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kPure)) .set_attr("maca.need_warp_shuffle", true); -} // namespace intrin -} // namespace codegen -} // namespace tvm +} // namespace intrin +} // namespace codegen +} // namespace tvm diff --git a/src/target/maca_common.h b/src/target/maca_common.h index 0ac54353a25de83d45d53785fafe60a56cbd86c6..3f5322b558249d79c4331faa020bf017059ff8bf 100644 --- a/src/target/maca_common.h +++ b/src/target/maca_common.h @@ -34,23 +34,24 @@ namespace tvm { namespace runtime { -#define MACA_DRIVER_CALL(x) \ - { \ - mcError_t result = x; \ - if (result != mcSuccess && result != mcErrorDeinitialized) { \ - LOG(FATAL) << "MACA MACA Error: " #x " failed with error: " << mcGetErrorString(result); \ - } \ +#define MACA_DRIVER_CALL(x) \ + { \ + mcError_t result = x; \ + if (result != mcSuccess && result != mcErrorDeinitialized) { \ + LOG(FATAL) << "MACA MACA Error: " #x " failed with error: " \ + << mcGetErrorString(result); \ + } \ } -#define MACA_CALL(func) \ - { \ - mcError_t e = (func); \ - ICHECK(e == mcSuccess) << "MACA MACA: " << mcGetErrorString(e); \ +#define MACA_CALL(func) \ + { \ + mcError_t e = (func); \ + ICHECK(e == mcSuccess) << "MACA MACA: " << mcGetErrorString(e); \ } /*! \brief Thread local workspace */ class MACAThreadEntry { - public: +public: /*! \brief The maca stream */ mcStream_t stream{nullptr}; /*! \brief thread local pool*/ @@ -58,8 +59,8 @@ class MACAThreadEntry { /*! \brief constructor */ MACAThreadEntry(); // get the threadlocal workspace - static MACAThreadEntry* ThreadLocal(); + static MACAThreadEntry *ThreadLocal(); }; -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_MACA_MACA_COMMON_H_ +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_MACA_MACA_COMMON_H_ diff --git a/src/target/maca_device_api.cc b/src/target/maca_device_api.cc index c9212acf81b42e754810bb1814dfd6d4ef69e322..2bad3e9acf6c179159af8dc7c3d6a7234fa4e110 100644 --- a/src/target/maca_device_api.cc +++ b/src/target/maca_device_api.cc @@ -41,109 +41,116 @@ namespace tvm { namespace runtime { class MACADeviceAPI final : public DeviceAPI { - public: +public: void SetDevice(Device dev) final { MACA_CALL(mcSetDevice(dev.device_id)); } - void GetAttr(Device device, DeviceAttrKind kind, ffi::Any* rv) final { + void GetAttr(Device device, DeviceAttrKind kind, ffi::Any *rv) final { int value = 0; switch (kind) { - case kExist: { - int count; - auto err = mcGetDeviceCount(&count); - value = (err == mcSuccess && static_cast(count > device.device_id)); - break; - } - case kMaxThreadsPerBlock: { - MACA_CALL( - mcDeviceGetAttribute(&value, mcDeviceAttributeMaxThreadsPerBlock, device.device_id)); - break; - } - case kWarpSize: { - MACA_CALL(mcDeviceGetAttribute(&value, mcDeviceAttributeWarpSize, device.device_id)); - break; - } - case kMaxSharedMemoryPerBlock: { - MACA_CALL(mcDeviceGetAttribute(&value, mcDeviceAttributeMaxSharedMemoryPerBlock, - device.device_id)); - break; - } - case kComputeVersion: { - std::ostringstream os; - MACA_CALL(mcDeviceGetAttribute(&value, mcDeviceAttributeComputeCapabilityMajor, - device.device_id)); - os << value << "."; - MACA_CALL(mcDeviceGetAttribute(&value, mcDeviceAttributeComputeCapabilityMinor, - device.device_id)); - os << value; - *rv = os.str(); - return; - } - case kDeviceName: { - std::string name(256, 0); - MACA_CALL(mcDeviceGetName(&name[0], name.size(), device.device_id)); - name.resize(strlen(name.c_str())); - *rv = std::move(name); - return; - } - case kMaxClockRate: { - MACA_CALL(mcDeviceGetAttribute(&value, mcDeviceAttributeClockRate, device.device_id)); - break; - } - case kMultiProcessorCount: { - MACA_CALL( - mcDeviceGetAttribute(&value, mcDeviceAttributeMultiProcessorCount, device.device_id)); - break; - } - case kMaxThreadDimensions: { - int dims[3]; - MACA_CALL(mcDeviceGetAttribute(&dims[0], mcDeviceAttributeMaxBlockDimX, device.device_id)); - MACA_CALL(mcDeviceGetAttribute(&dims[1], mcDeviceAttributeMaxBlockDimY, device.device_id)); - MACA_CALL(mcDeviceGetAttribute(&dims[2], mcDeviceAttributeMaxBlockDimZ, device.device_id)); + case kExist: { + int count; + auto err = mcGetDeviceCount(&count); + value = (err == mcSuccess && static_cast(count > device.device_id)); + break; + } + case kMaxThreadsPerBlock: { + MACA_CALL(mcDeviceGetAttribute( + &value, mcDeviceAttributeMaxThreadsPerBlock, device.device_id)); + break; + } + case kWarpSize: { + MACA_CALL(mcDeviceGetAttribute(&value, mcDeviceAttributeWarpSize, + device.device_id)); + break; + } + case kMaxSharedMemoryPerBlock: { + MACA_CALL(mcDeviceGetAttribute( + &value, mcDeviceAttributeMaxSharedMemoryPerBlock, device.device_id)); + break; + } + case kComputeVersion: { + std::ostringstream os; + MACA_CALL(mcDeviceGetAttribute( + &value, mcDeviceAttributeComputeCapabilityMajor, device.device_id)); + os << value << "."; + MACA_CALL(mcDeviceGetAttribute( + &value, mcDeviceAttributeComputeCapabilityMinor, device.device_id)); + os << value; + *rv = os.str(); + return; + } + case kDeviceName: { + std::string name(256, 0); + MACA_CALL(mcDeviceGetName(&name[0], name.size(), device.device_id)); + name.resize(strlen(name.c_str())); + *rv = std::move(name); + return; + } + case kMaxClockRate: { + MACA_CALL(mcDeviceGetAttribute(&value, mcDeviceAttributeClockRate, + device.device_id)); + break; + } + case kMultiProcessorCount: { + MACA_CALL(mcDeviceGetAttribute( + &value, mcDeviceAttributeMultiProcessorCount, device.device_id)); + break; + } + case kMaxThreadDimensions: { + int dims[3]; + MACA_CALL(mcDeviceGetAttribute(&dims[0], mcDeviceAttributeMaxBlockDimX, + device.device_id)); + MACA_CALL(mcDeviceGetAttribute(&dims[1], mcDeviceAttributeMaxBlockDimY, + device.device_id)); + MACA_CALL(mcDeviceGetAttribute(&dims[2], mcDeviceAttributeMaxBlockDimZ, + device.device_id)); - std::stringstream ss; - ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; - *rv = ss.str(); - return; - } - case kMaxRegistersPerBlock: - MACA_CALL( - mcDeviceGetAttribute(&value, mcDeviceAttributeMaxRegistersPerBlock, device.device_id)); - break; - case kGcnArch: - return; - case kApiVersion: { - // *rv = MACA_VERSION; - return; - } - case kDriverVersion: - return; - case kL2CacheSizeBytes: { - // Get size of device l2 cache size in bytes. - int l2_size; - MACA_CALL(mcDeviceGetAttribute(&l2_size, mcDeviceAttributeL2CacheSize, device.device_id)); - *rv = l2_size; - return; - } - case kTotalGlobalMemory: { - mcDeviceProp_t prop; - MACA_CALL(mcGetDeviceProperties(&prop, device.device_id)); - int64_t total_global_memory = prop.totalGlobalMem; - *rv = total_global_memory; - return; - } - case kAvailableGlobalMemory: { - size_t free_mem, total_mem; - MACA_CALL(mcMemGetInfo(&free_mem, &total_mem)); - *rv = static_cast(free_mem); - return; - } - case kImagePitchAlignment: - return; + std::stringstream ss; + ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; + *rv = ss.str(); + return; + } + case kMaxRegistersPerBlock: + MACA_CALL(mcDeviceGetAttribute( + &value, mcDeviceAttributeMaxRegistersPerBlock, device.device_id)); + break; + case kGcnArch: + return; + case kApiVersion: { + // *rv = MACA_VERSION; + return; + } + case kDriverVersion: + return; + case kL2CacheSizeBytes: { + // Get size of device l2 cache size in bytes. + int l2_size; + MACA_CALL(mcDeviceGetAttribute(&l2_size, mcDeviceAttributeL2CacheSize, + device.device_id)); + *rv = l2_size; + return; + } + case kTotalGlobalMemory: { + mcDeviceProp_t prop; + MACA_CALL(mcGetDeviceProperties(&prop, device.device_id)); + int64_t total_global_memory = prop.totalGlobalMem; + *rv = total_global_memory; + return; + } + case kAvailableGlobalMemory: { + size_t free_mem, total_mem; + MACA_CALL(mcMemGetInfo(&free_mem, &total_mem)); + *rv = static_cast(free_mem); + return; + } + case kImagePitchAlignment: + return; } *rv = value; } - void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { + void *AllocDataSpace(Device dev, size_t nbytes, size_t alignment, + DLDataType type_hint) final { ICHECK_EQ(256 % alignment, 0U) << "MACA space is aligned at 256 bytes"; - void* ret; + void *ret; if (dev.device_type == kDLMACAHost) { VLOG(1) << "allocating " << nbytes << "bytes on host"; MACA_CALL(mcMallocHost(&ret, nbytes)); @@ -155,7 +162,7 @@ class MACADeviceAPI final : public DeviceAPI { return ret; } - void FreeDataSpace(Device dev, void* ptr) final { + void FreeDataSpace(Device dev, void *ptr) final { if (dev.device_type == kDLMACAHost) { MACA_CALL(mcFreeHost(ptr)); } else { @@ -165,12 +172,13 @@ class MACADeviceAPI final : public DeviceAPI { } protected: - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - Device dev_from, Device dev_to, DLDataType type_hint, + void CopyDataFromTo(const void *from, size_t from_offset, void *to, + size_t to_offset, size_t size, Device dev_from, + Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { mcStream_t maca_stream = static_cast(stream); - from = static_cast(from) + from_offset; - to = static_cast(to) + to_offset; + from = static_cast(from) + from_offset; + to = static_cast(to) + to_offset; if (dev_from.device_type == kDLMACAHost) { dev_from.device_type = kDLCPU; } @@ -183,13 +191,15 @@ protected: if (dev_from.device_id == dev_to.device_id) { GPUCopy(from, to, size, mcMemcpyDeviceToDevice, maca_stream); } else { - MACA_CALL( - mcMemcpyPeerAsync(to, dev_to.device_id, from, dev_from.device_id, size, maca_stream)); + MACA_CALL(mcMemcpyPeerAsync(to, dev_to.device_id, from, + dev_from.device_id, size, maca_stream)); } - } else if (dev_from.device_type == kDLMACA && dev_to.device_type == kDLCPU) { + } else if (dev_from.device_type == kDLMACA && + dev_to.device_type == kDLCPU) { MACA_CALL(mcSetDevice(dev_from.device_id)); GPUCopy(from, to, size, mcMemcpyDeviceToHost, maca_stream); - } else if (dev_from.device_type == kDLCPU && dev_to.device_type == kDLMACA) { + } else if (dev_from.device_type == kDLCPU && + dev_to.device_type == kDLMACA) { MACA_CALL(mcSetDevice(dev_to.device_id)); GPUCopy(from, to, size, mcMemcpyHostToDevice, maca_stream); } else { @@ -211,7 +221,8 @@ public: MACA_CALL(mcStreamDestroy(mc_stream)); } - void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) { + void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, + TVMStreamHandle event_dst) { MACA_CALL(mcSetDevice(dev.device_id)); mcStream_t src_stream = static_cast(event_src); mcStream_t dst_stream = static_cast(event_dst); @@ -227,22 +238,22 @@ public: MACA_CALL(mcStreamSynchronize(static_cast(stream))); } - void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final { + void *AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final { return MACAThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size); } - void FreeWorkspace(Device dev, void* data) final { + void FreeWorkspace(Device dev, void *data) final { MACAThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data); } - static MACADeviceAPI* Global() { - static MACADeviceAPI* inst = new MACADeviceAPI(); + static MACADeviceAPI *Global() { + static MACADeviceAPI *inst = new MACADeviceAPI(); return inst; } - private: - static void GPUCopy(const void* from, void* to, size_t size, mcMemcpyKind kind, - mcStream_t stream) { +private: + static void GPUCopy(const void *from, void *to, size_t size, + mcMemcpyKind kind, mcStream_t stream) { MACA_CALL(mcMemcpyAsync(to, from, size, kind, stream)); } }; @@ -251,24 +262,27 @@ typedef dmlc::ThreadLocalStore MACAThreadStore; MACAThreadEntry::MACAThreadEntry() : pool(kDLMACA, MACADeviceAPI::Global()) {} -MACAThreadEntry* MACAThreadEntry::ThreadLocal() { return MACAThreadStore::Get(); } +MACAThreadEntry *MACAThreadEntry::ThreadLocal() { + return MACAThreadStore::Get(); +} TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; refl::GlobalDef() - .def_packed("device_api.maca", - [](ffi::PackedArgs args, ffi::Any *rv) { - DeviceAPI* ptr = MACADeviceAPI::Global(); - *rv = static_cast(ptr); - }) - .def_packed("device_api.maca_host", [](ffi::PackedArgs args, ffi::Any* rv) { - DeviceAPI* ptr = MACADeviceAPI::Global(); - *rv = static_cast(ptr); - }); + .def_packed("device_api.maca", + [](ffi::PackedArgs args, ffi::Any *rv) { + DeviceAPI *ptr = MACADeviceAPI::Global(); + *rv = static_cast(ptr); + }) + .def_packed("device_api.maca_host", + [](ffi::PackedArgs args, ffi::Any *rv) { + DeviceAPI *ptr = MACADeviceAPI::Global(); + *rv = static_cast(ptr); + }); } class MACATimerNode : public TimerNode { - public: +public: virtual void Start() { int device_id; MACA_CALL(mcGetDevice(&device_id)); @@ -293,9 +307,10 @@ class MACATimerNode : public TimerNode { MACA_CALL(mcEventCreate(&stop_)); } - TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.maca.MACATimerNode", MACATimerNode, TimerNode); + TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.maca.MACATimerNode", MACATimerNode, + TimerNode); - private: +private: mcEvent_t start_; mcEvent_t stop_; TVMStreamHandle stream_; @@ -303,8 +318,9 @@ class MACATimerNode : public TimerNode { TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def("profiling.timer.maca", - [](Device dev) { return Timer(ffi::make_object()); }); + refl::GlobalDef().def("profiling.timer.maca", [](Device dev) { + return Timer(ffi::make_object()); + }); } -} // namespace runtime -} // namespace tvm +} // namespace runtime +} // namespace tvm diff --git a/src/target/maca_module.cc b/src/target/maca_module.cc index 10cccc4f039542cf2d723960154920e68e6be4aa..b8c122c4797c10eb58cea6bfd2f19a277eba6e26 100644 --- a/src/target/maca_module.cc +++ b/src/target/maca_module.cc @@ -34,11 +34,11 @@ #include #include +#include "maca_common.h" #include "runtime/file_utils.h" #include "runtime/meta_data.h" #include "runtime/pack_args.h" #include "runtime/thread_storage_scope.h" -#include "maca_common.h" namespace tvm { namespace runtime { @@ -48,7 +48,7 @@ namespace runtime { // The runtime will contain a per-device module table // The modules will be lazily loaded class MACAModuleNode : public ffi::ModuleObj { - public: +public: explicit MACAModuleNode(std::string data, std::string fmt, std::unordered_map fmap, std::string maca_source) @@ -65,14 +65,15 @@ class MACAModuleNode : public ffi::ModuleObj { } } - const char* kind() const final { return "maca"; } + const char *kind() const final { return "maca"; } int GetPropertyMask() const final { return ffi::Module::kBinarySerializable | ffi::Module::kRunnable; } - ffi::Optional GetFunction(const ffi::String& name) final; + ffi::Optional GetFunction(const ffi::String &name) final; - void WriteToFile(const ffi::String& file_name, const ffi::String& format) const final { + void WriteToFile(const ffi::String &file_name, + const ffi::String &format) const final { std::string fmt = GetFileFormat(file_name, format); std::string meta_file = GetMetaFilePath(file_name); if (fmt == "maca") { @@ -89,25 +90,27 @@ class MACAModuleNode : public ffi::ModuleObj { ffi::Bytes SaveToBytes() const final { std::string buffer; dmlc::MemoryStringStream ms(&buffer); - dmlc::Stream* stream = &ms; + dmlc::Stream *stream = &ms; stream->Write(fmt_); stream->Write(fmap_); stream->Write(data_); return ffi::Bytes(buffer); } - ffi::String InspectSource(const ffi::String& format) const final { - if (format == fmt_) return data_; + ffi::String InspectSource(const ffi::String &format) const final { + if (format == fmt_) + return data_; if (maca_source_.length() != 0) { return maca_source_; } else { - if (fmt_ == "fatbin") return data_; + if (fmt_ == "fatbin") + return data_; return ""; } } // get a mcfunction_t from primary context in device_id - mcFunction_t GetFunc(int device_id, const std::string& func_name) { + mcFunction_t GetFunc(int device_id, const std::string &func_name) { std::lock_guard lock(mutex_); // must recheck under the lock scope @@ -115,7 +118,8 @@ class MACAModuleNode : public ffi::ModuleObj { MACA_DRIVER_CALL(mcModuleLoadData(&(module_[device_id]), data_.c_str())); } mcFunction_t func; - mcError_t result = mcModuleGetFunction(&func, module_[device_id], func_name.c_str()); + mcError_t result = + mcModuleGetFunction(&func, module_[device_id], func_name.c_str()); if (result != mcSuccess) { LOG(FATAL) << "MACAError: mcModuleGetFunction " << func_name << " failed with error: " << mcGetErrorString(result); @@ -123,7 +127,8 @@ class MACAModuleNode : public ffi::ModuleObj { return func; } // get a global var from primary context in device_id - mcDeviceptr_t GetGlobal(int device_id, const std::string& global_name, size_t expect_nbytes) { + mcDeviceptr_t GetGlobal(int device_id, const std::string &global_name, + size_t expect_nbytes) { std::lock_guard lock(mutex_); // must recheck under the lock scope if (module_[device_id] == nullptr) { @@ -132,12 +137,13 @@ class MACAModuleNode : public ffi::ModuleObj { mcDeviceptr_t global = nullptr; size_t nbytes = 0; - MACA_DRIVER_CALL(mcModuleGetGlobal(&global, &nbytes, module_[device_id], global_name.c_str())); + MACA_DRIVER_CALL(mcModuleGetGlobal(&global, &nbytes, module_[device_id], + global_name.c_str())); ICHECK_EQ(nbytes, expect_nbytes); return global; } - private: +private: // the binary data std::string data_; // The format @@ -154,10 +160,11 @@ class MACAModuleNode : public ffi::ModuleObj { // a wrapped function class to get packed func. class MACAWrappedFunc { - public: +public: // initialize the MACA function. - void Init(MACAModuleNode* m, ObjectPtr sptr, const std::string& func_name, - size_t num_void_args, const std::vector& launch_param_tags) { + void Init(MACAModuleNode *m, ObjectPtr sptr, + const std::string &func_name, size_t num_void_args, + const std::vector &launch_param_tags) { m_ = m; sptr_ = sptr; func_name_ = func_name; @@ -165,7 +172,7 @@ class MACAWrappedFunc { launch_param_config_.Init(num_void_args, launch_param_tags); } // invoke the function with void arguments - void operator()(ffi::PackedArgs args, ffi::Any* rv, void* packed_args, + void operator()(ffi::PackedArgs args, ffi::Any *rv, void *packed_args, size_t packed_nbytes) const { int device_id; MACA_CALL(mcGetDevice(&device_id)); @@ -173,21 +180,23 @@ class MACAWrappedFunc { fcache_[device_id] = m_->GetFunc(device_id, func_name_); } - mcStream_t strm = static_cast(TVMFFIEnvGetStream(kDLMACA, device_id)); + mcStream_t strm = + static_cast(TVMFFIEnvGetStream(kDLMACA, device_id)); ThreadWorkLoad wl = launch_param_config_.Extract(args); - void* config[] = {MC_LAUNCH_PARAM_BUFFER_POINTER, packed_args, MC_LAUNCH_PARAM_BUFFER_SIZE, - &packed_nbytes, MC_LAUNCH_PARAM_END}; + void *config[] = {MC_LAUNCH_PARAM_BUFFER_POINTER, packed_args, + MC_LAUNCH_PARAM_BUFFER_SIZE, &packed_nbytes, + MC_LAUNCH_PARAM_END}; // MACA supports only extra_args. - MACA_DRIVER_CALL(mcModuleLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1), - wl.grid_dim(2), wl.block_dim(0), wl.block_dim(1), - wl.block_dim(2), wl.dyn_shmem_size, strm, nullptr, - reinterpret_cast(&config))); + MACA_DRIVER_CALL(mcModuleLaunchKernel( + fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2), + wl.block_dim(0), wl.block_dim(1), wl.block_dim(2), wl.dyn_shmem_size, + strm, nullptr, reinterpret_cast(&config))); } - private: +private: // internal module - MACAModuleNode* m_; + MACAModuleNode *m_; // the resource holder ObjectPtr sptr_; // The name of the function. @@ -199,25 +208,29 @@ class MACAWrappedFunc { LaunchParamConfig launch_param_config_; }; -ffi::Optional MACAModuleNode::GetFunction(const ffi::String& name) { +ffi::Optional +MACAModuleNode::GetFunction(const ffi::String &name) { ObjectPtr sptr_to_self = ffi::GetObjectPtr(this); ICHECK_EQ(sptr_to_self.get(), this); auto it = fmap_.find(name); - if (it == fmap_.end()) return ffi::Function(); - const FunctionInfo& info = it->second; + if (it == fmap_.end()) + return ffi::Function(); + const FunctionInfo &info = it->second; MACAWrappedFunc f; - f.Init(this, sptr_to_self, name, info.arg_types.size(), info.launch_param_tags); + f.Init(this, sptr_to_self, name, info.arg_types.size(), + info.launch_param_tags); return PackFuncPackedArgAligned(f, info.arg_types); } ffi::Module MACAModuleCreate(std::string data, std::string fmt, - std::unordered_map fmap, - std::string maca_source) { + std::unordered_map fmap, + std::string maca_source) { auto n = ffi::make_object(data, fmt, fmap, maca_source); return ffi::Module(n); } -ffi::Module MACAModuleLoadFile(const std::string& file_name, const ffi::String& format) { +ffi::Module MACAModuleLoadFile(const std::string &file_name, + const ffi::String &format) { std::string data; std::unordered_map fmap; std::string fmt = GetFileFormat(file_name, format); @@ -227,9 +240,10 @@ ffi::Module MACAModuleLoadFile(const std::string& file_name, const ffi::String& return MACAModuleCreate(data, fmt, fmap, std::string()); } -ffi::Module MACAModuleLoadFromBytes(const ffi::Bytes& bytes) { - dmlc::MemoryFixedSizeStream ms(const_cast(bytes.data()), bytes.size()); - dmlc::Stream* stream = &ms; +ffi::Module MACAModuleLoadFromBytes(const ffi::Bytes &bytes) { + dmlc::MemoryFixedSizeStream ms(const_cast(bytes.data()), + bytes.size()); + dmlc::Stream *stream = &ms; std::string data; std::unordered_map fmap; std::string fmt; @@ -242,8 +256,8 @@ ffi::Module MACAModuleLoadFromBytes(const ffi::Bytes& bytes) { TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; refl::GlobalDef() - .def("ffi.Module.load_from_file.maca", MACAModuleLoadFile) - .def("ffi.Module.load_from_bytes.maca", MACAModuleLoadFromBytes); + .def("ffi.Module.load_from_file.maca", MACAModuleLoadFile) + .def("ffi.Module.load_from_bytes.maca", MACAModuleLoadFromBytes); } -} // namespace runtime -} // namespace tvm +} // namespace runtime +} // namespace tvm diff --git a/src/target/maca_module.h b/src/target/maca_module.h index a1dc2a43a7474b4593bbf62de138493038bab9d8..46896ae0c8427b9a190ad11a7587497ee88f54ce 100644 --- a/src/target/maca_module.h +++ b/src/target/maca_module.h @@ -48,8 +48,8 @@ static constexpr const int kMaxNumGPUs = 32; * \param maca_source Optional, maca source file */ ffi::Module MACAModuleCreate(std::string data, std::string fmt, - std::unordered_map fmap, - std::string maca_source); -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_MACA_MACA_MODULE_H_ + std::unordered_map fmap, + std::string maca_source); +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_MACA_MACA_MODULE_H_ diff --git a/src/target/maca_target_kind.cc b/src/target/maca_target_kind.cc index 4ceff4cdc2ec52340b56a4a03318135ea198090f..bb2ec579bf38ffcc39d302f633b2d27104772429 100644 --- a/src/target/maca_target_kind.cc +++ b/src/target/maca_target_kind.cc @@ -3,24 +3,28 @@ namespace tvm { -std::string ExtractStringWithPrefix(const std::string& str, const std::string& prefix) { - if (str.find(prefix) != 0) return ""; +std::string ExtractStringWithPrefix(const std::string &str, + const std::string &prefix) { + if (str.find(prefix) != 0) + return ""; std::size_t pos = prefix.length(); - while (pos < str.length() && (std::isdigit(str[pos]) || std::isalpha(str[pos]))) { + while (pos < str.length() && + (std::isdigit(str[pos]) || std::isalpha(str[pos]))) { ++pos; } return str.substr(prefix.length(), pos - prefix.length()); } -void CheckOrSetAttr(ffi::Map* attrs, const ffi::String& name, - const ffi::String& value) { +void CheckOrSetAttr(ffi::Map *attrs, + const ffi::String &name, const ffi::String &value) { auto iter = attrs->find(name); if (iter == attrs->end()) { attrs->Set(name, value); } else { auto str = (*iter).second.try_cast(); - ICHECK(str && str.value() == value) << "ValueError: Expects \"" << name << "\" to be \"" - << value << "\", but gets: " << (*iter).second; + ICHECK(str && str.value() == value) + << "ValueError: Expects \"" << name << "\" to be \"" << value + << "\", but gets: " << (*iter).second; } } @@ -36,10 +40,12 @@ TargetJSON UpdateMACAAttrs(TargetJSON target) { if (target.count("mcpu")) { ffi::String mcpu = Downcast(target.at("mcpu")); arch = ExtractStringWithPrefix(mcpu, "xcore"); - ICHECK(!arch.empty()) << "ValueError: MACA target gets an invalid XCORE version: -mcpu=" - << mcpu; + ICHECK(!arch.empty()) + << "ValueError: MACA target gets an invalid XCORE version: -mcpu=" + << mcpu; } else { - if (auto f_get_maca_arch = tvm::ffi::Function::GetGlobal("tvm_callback_maca_get_arch")) { + if (auto f_get_maca_arch = + tvm::ffi::Function::GetGlobal("tvm_callback_maca_get_arch")) { arch = (*f_get_maca_arch)().cast(); } target.Set("mcpu", ffi::String(arch)); diff --git a/src/target/rt_mod_maca.cc b/src/target/rt_mod_maca.cc index 9c9597f17817faf27e41bfca9d2678693be39a2e..281d96b95838fd57dc29d552830aeea1e9daa672 100644 --- a/src/target/rt_mod_maca.cc +++ b/src/target/rt_mod_maca.cc @@ -1,4 +1,5 @@ -// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights reserved. +// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights +// reserved. #include "../transform/common/attr.h" #include "codegen_maca.h" @@ -118,9 +119,9 @@ ffi::Module BuildTileLangMACAWithoutCompile(IRModule mod, Target target) { TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; refl::GlobalDef() - .def("target.build.tilelang_maca", BuildTileLangMACA) - .def("target.build.tilelang_maca_without_compile", - BuildTileLangMACAWithoutCompile); + .def("target.build.tilelang_maca", BuildTileLangMACA) + .def("target.build.tilelang_maca_without_compile", + BuildTileLangMACAWithoutCompile); } } // namespace codegen diff --git a/src/target/utils.cc b/src/target/utils.cc index 0e35bf9b88ed044e00091ccb69c0874adc0a6c6c..e35f1891984a224074ac5618e367a32fcb192289 100644 --- a/src/target/utils.cc +++ b/src/target/utils.cc @@ -1,4 +1,5 @@ -// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights +// Reserved. /*! * \file tl/target/utils.cc * \brief helper functions for target attributes. diff --git a/src/target/utils.h b/src/target/utils.h index 6ed6cc9e2f6faa8aa9ef717a486e1687fb7a3888..c569c4e615aabafda1d1e677ea1f379857e218d9 100644 --- a/src/target/utils.h +++ b/src/target/utils.h @@ -1,4 +1,5 @@ -// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +// 2025 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights +// Reserved. /*! * \file tl/target/utils.h * \brief helper functions for target attributes. diff --git a/src/tl_templates/maca/common.h b/src/tl_templates/maca/common.h index 9b0197291768f4e2dbc4a25c184708c5eb7aa489..24b5d1418df4a5aa3edb0f485cfa71cfc39c7fc7 100644 --- a/src/tl_templates/maca/common.h +++ b/src/tl_templates/maca/common.h @@ -1,12 +1,13 @@ -// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights reserved. +// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights +// reserved. #pragma once -#include #include #include -#include #include +#include +#include #include #define MACART_INF_F __int_as_float(0x7f800000) @@ -31,20 +32,20 @@ #define TILELANG_CHECK(stmt) \ do { \ - mcError_t __err = (stmt); \ - if (__err != mcSuccess) { \ + mcError_t __err = (stmt); \ + if (__err != mcSuccess) { \ snprintf(error_buf, ERROR_BUF_SIZE, "%s:%d: %s - %s", __FILE__, \ - __LINE__, mcGetErrorName(__err), mcGetErrorString(__err)); \ + __LINE__, mcGetErrorName(__err), mcGetErrorString(__err)); \ return -1; \ } \ } while (0) #define TILELANG_CHECK_LAST_ERROR(kernel_name) \ do { \ - mcError_t __err = mcGetLastError(); \ - if (__err != mcSuccess) { \ + mcError_t __err = mcGetLastError(); \ + if (__err != mcSuccess) { \ snprintf(error_buf, ERROR_BUF_SIZE, "kernel_name: %s - %s", \ - mcGetErrorName(__err), mcGetErrorString(__err)); \ + mcGetErrorName(__err), mcGetErrorString(__err)); \ return -1; \ } \ } while (0) @@ -101,7 +102,8 @@ TL_DEVICE unsigned __pack_half2(const half_t x, const half_t y) { } // Pack two bfloat16_t values. -TL_DEVICE unsigned __pack_maca_bfloat162(const bfloat16_t x, const bfloat16_t y) { +TL_DEVICE unsigned __pack_maca_bfloat162(const bfloat16_t x, + const bfloat16_t y) { unsigned v0 = *((unsigned short *)&x); unsigned v1 = *((unsigned short *)&y); return (v1 << 16) | v0; @@ -113,8 +115,7 @@ TL_DEVICE void AtomicAdd(T1 *address, T2 val, int memory_order = 0) { atomicAdd(reinterpret_cast(address), static_cast(val)); } -template -TL_DEVICE void AtomicAdd(_Float16 *address, T val) { +template TL_DEVICE void AtomicAdd(_Float16 *address, T val) { atomicAdd(reinterpret_cast<__half *>(address), static_cast<__half>(val)); } @@ -129,11 +130,14 @@ TL_DEVICE half_t min(const half_t a, const half_t b) { // DP4A TL_DEVICE int __dp4a(int srcA, int srcB, int c) { int4 v_srca{(signed char)(srcA & 0xff), (signed char)((srcA >> 8) & 0xff), - (signed char)((srcA >> 16) & 0xff), (signed char)((srcA >> 24) & 0xff)}; + (signed char)((srcA >> 16) & 0xff), + (signed char)((srcA >> 24) & 0xff)}; int4 v_srcb{(signed char)(srcB & 0xff), (signed char)((srcB >> 8) & 0xff), - (signed char)((srcB >> 16) & 0xff), (signed char)((srcB >> 24) & 0xff)}; + (signed char)((srcB >> 16) & 0xff), + (signed char)((srcB >> 24) & 0xff)}; - return v_srca.x * v_srcb.x + v_srca.y * v_srcb.y + v_srca.z * v_srcb.z + v_srca.w * v_srcb.w + c; + return v_srca.x * v_srcb.x + v_srca.y * v_srcb.y + v_srca.z * v_srcb.z + + v_srca.w * v_srcb.w + c; } // Helper to cast SMEM pointer to unsigned diff --git a/src/tl_templates/maca/debug.h b/src/tl_templates/maca/debug.h index 874bef4dbd2e4c985fdd43d84937267000c8a9ed..22b9423b5e4877eb002b98c08f6df6adaa938674 100644 --- a/src/tl_templates/maca/debug.h +++ b/src/tl_templates/maca/debug.h @@ -1,4 +1,5 @@ -// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights reserved. +// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights +// reserved. #pragma once @@ -53,7 +54,6 @@ template <> __device__ void debug_print_var(const char *msg, half var) { threadIdx.z, (float)var); } - // Specialization for bfloat16_t type template <> __device__ void debug_print_var(const char *msg, bfloat16_t var) { @@ -86,10 +86,9 @@ __device__ void debug_print_var(const char *msg, fp8_e4_t var) { // template <> // __device__ void debug_print_var(const char *msg, fp8_e5_t var) { // printf( -// "msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=fp8_e5_t " -// "value=%f\n", -// msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, -// threadIdx.z, (float)var); +// "msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=fp8_e5_t +// " "value=%f\n", msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, +// threadIdx.y, threadIdx.z, (float)var); // } // Template declaration for device-side debug printing (buffer only) @@ -190,7 +189,8 @@ __device__ void debug_print_buffer_value(const char *msg, // __device__ void debug_print_buffer_value(const char *msg, // const char *buf_name, // int index, fp8_e5_t var) { -// printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " +// printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, +// " // "index=%d, dtype=fp8_e5_t value=%f\n", // msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, // threadIdx.z, buf_name, index, (float)var); diff --git a/src/tl_templates/maca/gemm.h b/src/tl_templates/maca/gemm.h index 47f11b89d9c70bcd87ac0be66269a46e24c291d3..681922ba4976e901974baa577e743f8064fdbdbb 100644 --- a/src/tl_templates/maca/gemm.h +++ b/src/tl_templates/maca/gemm.h @@ -1,4 +1,5 @@ -// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights reserved. +// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights +// reserved. #pragma once @@ -11,8 +12,7 @@ namespace cute { template struct DispatchInstruction; -template <> -struct DispatchInstruction { +template <> struct DispatchInstruction { using MMA = MMA_Atom>; }; @@ -24,7 +24,7 @@ template struct OperandTraits<16, N, K, true, num_warp_n, typename std::enable_if::type> { using LayoutAtom = decltype(composition( - Swizzle<2, 3, 3>{}, Layout, Stride<_32, _1>>{})); + Swizzle<2, 3, 3>{}, Layout, Stride<_32, _1>>{})); using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); using Copy = Copy_Traits>; }; @@ -68,8 +68,10 @@ public: using Instruction = DispatchInstruction; - using OperandATraits = OperandTraits::value, M, K, !trans_A, num_warp_m>; - using OperandBTraits = OperandTraits::value, N, K, trans_B, num_warp_n>; + using OperandATraits = + OperandTraits::value, M, K, !trans_A, num_warp_m>; + using OperandBTraits = + OperandTraits::value, N, K, trans_B, num_warp_n>; using SmemLayoutA = typename OperandATraits::Layout; using SmemLayoutB = typename OperandBTraits::Layout; @@ -159,9 +161,8 @@ template MCTLASS_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) { - using MMA = - cute::GemmTensorOp; + using MMA = cute::GemmTensorOp; MMA::body(pA, pB, accum); } @@ -169,9 +170,8 @@ template TL_DEVICE void gemm_rs(A_type *pA, B_type *pB, C_type *accum) { - using MMA = - cute::GemmTensorOp; + using MMA = cute::GemmTensorOp; MMA::body_rs(pA, pB, accum); } } // namespace tl diff --git a/src/tl_templates/maca/reduce.h b/src/tl_templates/maca/reduce.h index ecce05745bc72bb1a50c5b73b21f48657f00d6ba..e455ed52677fc88b125519ad60c0c8330e39216e 100644 --- a/src/tl_templates/maca/reduce.h +++ b/src/tl_templates/maca/reduce.h @@ -1,4 +1,5 @@ -// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights reserved. +// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights +// reserved. #pragma once diff --git a/src/tl_templates/maca/threadblock_swizzle.h b/src/tl_templates/maca/threadblock_swizzle.h index 60671cbfef178e311193979698722c57bc3d36d2..1f2f5b0eeb88bf0da4a15a4b26a8750c033f4ac6 100644 --- a/src/tl_templates/maca/threadblock_swizzle.h +++ b/src/tl_templates/maca/threadblock_swizzle.h @@ -1,4 +1,5 @@ -// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights reserved. +// Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights +// reserved. #pragma once diff --git a/src/transform/arg_binder.cc b/src/transform/arg_binder.cc index 4b92c0a15f7a95c171928f14e2b13b101fe76d6b..9909588e98185daadfc28303306a3ac42ffdc24b 100644 --- a/src/transform/arg_binder.cc +++ b/src/transform/arg_binder.cc @@ -1041,7 +1041,9 @@ void ArgBinder::BindDLTensors( // Check device_type consistency (device_id equality is implicitly ensured // by binding above) { - PrimExpr ok = (device_type == actual_dev_type) || (device_type == DLDeviceType::kDLMACA && actual_dev_type == DLDeviceType::kDLCUDA); + PrimExpr ok = (device_type == actual_dev_type) || + (device_type == DLDeviceType::kDLMACA && + actual_dev_type == DLDeviceType::kDLCUDA); ffi::Array pargs2; pargs2.push_back(StringImm(tvm_error_device_type_mismatch)); pargs2.push_back(StringImm(kernel_nm)); diff --git a/test/test.commit b/test/test.commit new file mode 100644 index 0000000000000000000000000000000000000000..11819826f8a5c38ada1cf190af772c4b6da989dd --- /dev/null +++ b/test/test.commit @@ -0,0 +1,3 @@ +测试1 +测试2 +测试3 diff --git a/testing/python/conftest.py b/testing/python/conftest.py index a6766a8df67cf55c71e01af1f32d62ef1330ac65..05fba382ce020024d8c92d8fe213a8e37bdef09e 100644 --- a/testing/python/conftest.py +++ b/testing/python/conftest.py @@ -3,15 +3,12 @@ import os import pytest + def _parameterize_target(metafunc): - # ENV variable TILELANG_TEST_TARGETS specify target names splited by ";" + # ENV variable TILELANG_TEST_TARGETS specify target names split by ";" # default value is maca if "target" in metafunc.fixturenames: - parametrized_args = [ - arg.strip() - for mark in metafunc.definition.iter_markers("parametrize") - for arg in mark.args[0].split(",") - ] + parametrized_args = [arg.strip() for mark in metafunc.definition.iter_markers("parametrize") for arg in mark.args[0].split(",")] if "target" not in parametrized_args: mark = pytest.mark.parametrize( "target", @@ -20,5 +17,6 @@ def _parameterize_target(metafunc): ) metafunc.definition.add_marker(mark) + def pytest_generate_tests(metafunc): - _parameterize_target(metafunc) \ No newline at end of file + _parameterize_target(metafunc) diff --git a/tilelang/carver/arch/maca.py b/tilelang/carver/arch/maca.py index 127503c927c640f8746f750e29ab580dbdeed911..280d2dc5c0ac1699c91c9655cc0aafba39680775 100644 --- a/tilelang/carver/arch/maca.py +++ b/tilelang/carver/arch/maca.py @@ -1,18 +1,19 @@ # Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights reserved. +from __future__ import annotations import tvm from tvm.target import Target from .arch_base import TileDevice -from typing import List, Union from .cuda import TensorInstruction + def is_maca_arch(arch: TileDevice) -> bool: return isinstance(arch, MACA) class MACA(TileDevice): # FIXME: config should meets MACA - def __init__(self, target: Union[Target, str]): + def __init__(self, target: Target | str): if isinstance(target, str): target = tvm.target.Target(target) self.target = target @@ -29,12 +30,10 @@ class MACA(TileDevice): self.max_smem_usage: int = 2 * self.smem_cap self.sm_partition: int = 8 self.l2_cache_size_bytes: int = target.l2_cache_size_bytes - self.transaction_size: List[int] = [32, 128] # in bytes + self.transaction_size: list[int] = [32, 128] # in bytes - self.bandwidth: List[int] = [750, 12080] + self.bandwidth: list[int] = [750, 12080] def get_avaliable_tensorintrin_shapes(self): - self.available_tensor_instructions = ( - TensorInstruction("wmma", [16, 16]), - ) + self.available_tensor_instructions = (TensorInstruction("wmma", [16, 16]),) return [t.shape for t in self.available_tensor_instructions] diff --git a/tilelang/carver/roller/hint.py b/tilelang/carver/roller/hint.py index f65ad705d02b1ce2de27f38d497bc94d7361c37b..d290a14af1502acf7be60a6628766255c9acb394 100644 --- a/tilelang/carver/roller/hint.py +++ b/tilelang/carver/roller/hint.py @@ -111,6 +111,7 @@ class TileDict: def __repr__(self) -> str: return str(self) + class IntrinInfo: """ The information of tensorcore intrinsic related information diff --git a/tilelang/contrib/mxcc.py b/tilelang/contrib/mxcc.py index 2e188eca100c54af241dd24d78693a8d843249fb..13ef021a23381d4b5850932326a417b63f93771b 100644 --- a/tilelang/contrib/mxcc.py +++ b/tilelang/contrib/mxcc.py @@ -236,8 +236,7 @@ def get_target_compute_version(target=None): return tvm.maca(0).compute_version raise ValueError( - "No MACA architecture was specified or GPU detected." - "Try specifying it by adding '--offload-arch=xcorexxxx' to your target." + "No MACA architecture was specified or GPU detected.Try specifying it by adding '--offload-arch=xcorexxxx' to your target." ) diff --git a/tilelang/intrinsics/maca_mma_macro_generator.py b/tilelang/intrinsics/maca_mma_macro_generator.py index f551a0c5f78921dbb4c7012de8f0783386023759..6eca0b527da446372fdd3dc76fecddf21f494eff 100644 --- a/tilelang/intrinsics/maca_mma_macro_generator.py +++ b/tilelang/intrinsics/maca_mma_macro_generator.py @@ -18,16 +18,12 @@ from .mfma_layout import ( shared_16x16_to_local_64x4_layout_B, shared_16x32_to_local_64x8_layout_A, shared_16x32_to_local_64x8_layout_B, - shared_16x64_to_local_64x16_layout_A, - shared_16x64_to_local_64x16_layout_B, thread_id_shared_access_64x1_to_16x4_layout_A, thread_id_shared_access_64x1_to_4x16_layout_B, thread_id_shared_access_64x4_to_16x16_layout_A, thread_id_shared_access_64x4_to_16x16_layout_B, thread_id_shared_access_64x8_to_16x32_layout_A, thread_id_shared_access_64x8_to_16x32_layout_B, - thread_id_shared_access_64x16_to_16x64_layout_A, - thread_id_shared_access_64x16_to_16x64_layout_B, ) lift = convert @@ -130,9 +126,8 @@ class TensorCoreIntrinEmitter: self.accum_dtype_abbrv = self.dtype_abbrv[accum_dtype] def _initialize_mma_prefix(self, k_dim=16): - in_dtype, out_dtype = self.a_dtype, self.accum_dtype + in_dtype = self.a_dtype M_DIM, N_DIM = self.M_DIM, self.N_DIM - out_dtype_abbrv = {T.float16: "f16", T.float32: "f32", T.int8: "i8", T.int32: "i32"}[out_dtype] in_dtype_abbrv = { "bfloat16": "bf16", @@ -419,7 +414,7 @@ class TensorCoreIntrinEmitter: tx, warp_n, warp_m = self.extract_thread_binding(thread_binding) for i, j in T.grid(warp_rows, warp_cols): for local_id in T.vectorized(local_size_out): - row, col = T.meta_var(maca_mma_store_index_map(tx, local_id)) + row, col = T.meta_var(mfma_store_index_map(tx, local_id)) C_buf[ (pid_m * BLOCK_M + warp_m * warp_rows + i) * M_DIM + row, (pid_n * BLOCK_N + warp_n * warp_cols + j) * N_DIM + col ] = C_local_buf[i * warp_cols * local_size_out + j * local_size_out + local_id] diff --git a/tilelang/intrinsics/mfma_macro_generator.py b/tilelang/intrinsics/mfma_macro_generator.py index 851d64d35a3f9faa699e7e6fa2f0a255068c56a0..984476a4977f1116c7cd29d21622aa43aa54443a 100644 --- a/tilelang/intrinsics/mfma_macro_generator.py +++ b/tilelang/intrinsics/mfma_macro_generator.py @@ -186,6 +186,7 @@ class MatrixCoreIntrinEmitter: self.mfma_suffix = f"{out_dtype_abbrv}_{M_DIM}x{N_DIM}x{k_dim}{in_dtype_abbrv}" else: import logging + logger = logging.getLogger(__name__) logger.warning("need to fix mfma suffix") self.mfma_suffix = f"{M_DIM}x{N_DIM}x{k_dim}{in_dtype_abbrv}" diff --git a/tilelang/quantize/lop3_maca.py b/tilelang/quantize/lop3_maca.py index 77b094a7535881a6e9f964c1207765d625e8eb34..9508b391fe6caec562764063a2aed601b9ecdad0 100644 --- a/tilelang/quantize/lop3_maca.py +++ b/tilelang/quantize/lop3_maca.py @@ -1,6 +1,5 @@ # Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd. All rights reserved. -from typing import Dict, Literal decode_i4_to_f16 = """ #include "maca_fp16.h" @@ -36,6 +35,4 @@ __device__ void decode_i4u_to_f16(T1 *_i4u, T2 *B_local_decode, const int N = 8) } """ -import_maca_c_map = { - "i4_to_f16": decode_i4_to_f16 -} \ No newline at end of file +import_maca_c_map = {"i4_to_f16": decode_i4_to_f16} diff --git a/tilelang/tileop/gemm_sp/__init__.py b/tilelang/tileop/gemm_sp/__init__.py index 6b8f97cef0e43efc79cd8ee7ddd13d22e66d3303..9966abf0a9a9e10131d8f1f93abc42bebd3a5ba7 100644 --- a/tilelang/tileop/gemm_sp/__init__.py +++ b/tilelang/tileop/gemm_sp/__init__.py @@ -1,8 +1,6 @@ from tilelang import tvm as tvm from tvm import tir -from tilelang.utils.target import ( - target_is_cuda, target_is_maca -) +from tilelang.utils.target import target_is_cuda, target_is_maca from tvm.target import Target from tvm.ir.base import Node from tvm.ir import Range