diff --git a/docs/lite/api/source_en/conf.py b/docs/lite/api/source_en/conf.py index d407601c2359cfc57ef4d0a2e69ce2a17bafa803..761b49102147e755b4a2115890f639c7950d8c50 100644 --- a/docs/lite/api/source_en/conf.py +++ b/docs/lite/api/source_en/conf.py @@ -334,6 +334,39 @@ try: except: print('lite替换安装包内容失败') +# 发版本时这里启用 +# modify urls +# re_url = r"(((gitee.com/mindspore/docs/mindspore-lite)|(github.com/mindspore-ai/(mindspore|docs))|" + \ +# r"(mindspore.cn/(docs|tutorials|lite))|(obs.dualstack.cn-north-4.myhuaweicloud)|" + \ +# r"(mindspore-website.obs.cn-north-4.myhuaweicloud))[\w\d/_.-]*?)/(master)" + +# re_url2 = r"(gitee.com/mindspore/mindspore[\w\d/_.-]*?)/(master)" + +# re_url3 = r"(((gitee.com/mindspore/mindformers)|(mindspore.cn/mindformers))[\w\d/_.-]*?)/(dev)" + +# with open(os.path.join('./mindspore_lite.rst'), 'r+', encoding='utf-8') as f: +# content = f.read() +# new_content = re.sub(re_url, r'\1/r2.7.0rc1', content) +# new_content = re.sub(re_url2, r'\1/v2.7.0-rc1', new_content) +# new_content = re.sub(re_url4, r'\1/r1.6.0', new_content) +# if new_content != content: +# f.seek(0) +# f.truncate() +# f.write(new_content) + +# for cur, _, files in os.walk(os.path.join(base_path, 'mindspore_lite')): +# for i in files: +# if i.endswith('.py'): +# with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: +# content = f.read() +# new_content = re.sub(re_url, r'\1/r2.7.0rc1', content) +# new_content = re.sub(re_url2, r'\1/v2.7.0-rc1', new_content) +# new_content = re.sub(re_url3, r'\1/r1.6.0', new_content) +# if new_content != content: +# f.seek(0) +# f.truncate() +# f.write(new_content) + # modify urls import json diff --git a/docs/lite/api/source_zh_cn/conf.py b/docs/lite/api/source_zh_cn/conf.py index 6f9409f5d26993e99b7c947f1fee960c27e1beee..38393a47aef26c509ae6c561be135946bc6d2d80 100644 --- a/docs/lite/api/source_zh_cn/conf.py +++ b/docs/lite/api/source_zh_cn/conf.py @@ -233,8 +233,35 @@ docs_branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if vers re_view = f"\n.. 
image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{docs_branch}/" + \ f"resource/_static/logo_source.svg\n :target: https://gitee.com/mindspore/{copy_repo}/blob/{branch}/" +# 发版本时这里启用 +# modify urls +# re_url = r"(((gitee.com/mindspore/docs/mindspore-lite)|(github.com/mindspore-ai/(mindspore|docs))|" + \ +# r"(mindspore.cn/(docs|tutorials|lite))|(obs.dualstack.cn-north-4.myhuaweicloud)|" + \ +# r"(mindspore-website.obs.cn-north-4.myhuaweicloud))[\w\d/_.-]*?)/(master)" + +# re_url2 = r"(gitee.com/mindspore/mindspore[\w\d/_.-]*?)/(master)" + +# re_url3 = r"(((gitee.com/mindspore/mindformers)|(mindspore.cn/mindformers))[\w\d/_.-]*?)/(dev)" + for cur, _, files in os.walk(present_path): for i in files: + # 发版本时这里启用 + # if i.endswith('.rst') or i.endswith('.md') or i.endswith('.ipynb'): + # try: + # with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: + # content = f.read() + # new_content = re.sub(re_url, r'\1/r2.7.0rc1', content) + # new_content = re.sub(re_url3, r'\1/r1.6.0', new_content) + # if i.endswith('.rst'): + # new_content = re.sub(re_url2, r'\1/v2.7.0-rc1', new_content) + # if new_content != content: + # f.seek(0) + # f.truncate() + # f.write(new_content) + # except Exception: + # print(f'打开{i}文件失败') + + # master使用 flag_copy = 0 if i.endswith('.rst'): for j in copy_list: diff --git a/docs/mindformers/docs/source_en/introduction/models.md b/docs/mindformers/docs/source_en/introduction/models.md index dec643604616a45c9d37fa5b3935b1ba274004b6..60fd8bef432252e6979335242c1a983c7a057c17 100644 --- a/docs/mindformers/docs/source_en/introduction/models.md +++ b/docs/mindformers/docs/source_en/introduction/models.md @@ -6,32 +6,32 @@ The following table lists models supported by MindSpore TransFormers. | Model | Specifications | Model Type | Latest Version | |:--------------------------------------------------------------------------------------------------------|:------------------------------|:----------------:|:--------------:| -| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/blob/master/research/deepseek3) | 671B | Sparse LLM | 1.6.0 | +| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/tree/master/research/deepseek3) | 671B | Sparse LLM | 1.6.0 | | [GLM4](https://gitee.com/mindspore/mindformers/blob/master/docs/model_cards/glm4.md) | 9B | Dense LLM | 1.6.0 | -| [Llama3.1](https://gitee.com/mindspore/mindformers/blob/master/research/llama3_1) | 8B/70B | Dense LLM | 1.6.0 | -| [Mixtral](https://gitee.com/mindspore/mindformers/blob/master/research/mixtral) | 8x7B | Sparse LLM | 1.6.0 | -| [Qwen2.5](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5) | 0.5B/1.5B/7B/14B/32B/72B | Dense LLM | 1.6.0 | -| [TeleChat2](https://gitee.com/mindspore/mindformers/blob/master/research/telechat2) | 7B/35B/115B | Dense LLM | 1.6.0 | +| [Llama3.1](https://gitee.com/mindspore/mindformers/tree/master/research/llama3_1) | 8B/70B | Dense LLM | 1.6.0 | +| [Mixtral](https://gitee.com/mindspore/mindformers/tree/master/research/mixtral) | 8x7B | Sparse LLM | 1.6.0 | +| [Qwen2.5](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5) | 0.5B/1.5B/7B/14B/32B/72B | Dense LLM | 1.6.0 | +| [TeleChat2](https://gitee.com/mindspore/mindformers/tree/master/research/telechat2) | 7B/35B/115B | Dense LLM | 1.6.0 | | [CodeLlama](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/codellama.md) | 34B | Dense LLM | 1.5.0 | | [CogVLM2-Image](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_image.md) | 
19B | MM | 1.5.0 | | [CogVLM2-Video](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md) | 13B | MM | 1.5.0 | -| [DeepSeek-V2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek2) | 236B | Sparse LLM | 1.5.0 | -| [DeepSeek-Coder-V1.5](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek1_5) | 7B | Dense LLM | 1.5.0 | -| [DeepSeek-Coder](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek) | 33B | Dense LLM | 1.5.0 | -| [GLM3-32K](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/glm32k) | 6B | Dense LLM | 1.5.0 | +| [DeepSeek-V2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek2) | 236B | Sparse LLM | 1.5.0 | +| [DeepSeek-Coder-V1.5](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek1_5) | 7B | Dense LLM | 1.5.0 | +| [DeepSeek-Coder](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek) | 33B | Dense LLM | 1.5.0 | +| [GLM3-32K](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/glm32k) | 6B | Dense LLM | 1.5.0 | | [GLM3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/glm3.md) | 6B | Dense LLM | 1.5.0 | -| [InternLM2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/internlm2) | 7B/20B | Dense LLM | 1.5.0 | +| [InternLM2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/internlm2) | 7B/20B | Dense LLM | 1.5.0 | | [Llama3.2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama3_2.md) | 3B | Dense LLM | 1.5.0 | | [Llama3.2-Vision](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/mllama.md) | 11B | MM | 1.5.0 | -| [Llama3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3) | 8B/70B | Dense LLM | 1.5.0 | +| [Llama3](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/llama3) | 8B/70B | Dense LLM | 1.5.0 | | [Llama2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md) | 7B/13B/70B | Dense LLM | 1.5.0 | -| [Qwen2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2) | 0.5B/1.5B/7B/57B/57B-A14B/72B | Dense/Sparse LLM | 1.5.0 | -| [Qwen1.5](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen1_5) | 7B/14B/72B | Dense LLM | 1.5.0 | -| [Qwen-VL](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwenvl) | 9.6B | MM | 1.5.0 | -| [TeleChat](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/telechat) | 7B/12B/52B | Dense LLM | 1.5.0 | +| [Qwen2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2) | 0.5B/1.5B/7B/57B/57B-A14B/72B | Dense/Sparse LLM | 1.5.0 | +| [Qwen1.5](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen1_5) | 7B/14B/72B | Dense LLM | 1.5.0 | +| [Qwen-VL](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwenvl) | 9.6B | MM | 1.5.0 | +| [TeleChat](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/telechat) | 7B/12B/52B | Dense LLM | 1.5.0 | | [Whisper](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/whisper.md) | 1.5B | MM | 1.5.0 | -| [Yi](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/yi) | 6B/34B | Dense LLM | 1.5.0 | -| [YiZhao](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/yizhao) | 12B | Dense LLM | 1.5.0 | +| [Yi](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/yi) | 6B/34B | Dense LLM | 1.5.0 | +| [YiZhao](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/yizhao) | 12B | Dense LLM | 1.5.0 | | 
[Baichuan2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/baichuan2/baichuan2.md) | 7B/13B | Dense LLM | 1.3.2 | | [GLM2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/glm2.md) | 6B | Dense LLM | 1.3.2 | | [GPT2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/gpt2.md) | 124M/13B | Dense LLM | 1.3.2 | diff --git a/docs/mindformers/docs/source_zh_cn/introduction/models.md b/docs/mindformers/docs/source_zh_cn/introduction/models.md index a71e537a311577a44caf7b2c71ddf73bc74c0103..4cdfeaadce869149443b9084be5af9bfd69363ef 100644 --- a/docs/mindformers/docs/source_zh_cn/introduction/models.md +++ b/docs/mindformers/docs/source_zh_cn/introduction/models.md @@ -6,32 +6,32 @@ | 模型名 | 支持规格 | 模型类型 | 最新支持版本 | |:--------------------------------------------------------------------------------------------------------|:------------------------------------------|:-----------:|:------:| -| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/blob/master/research/deepseek3) | 671B | 稀疏LLM | 1.6.0 | +| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/tree/master/research/deepseek3) | 671B | 稀疏LLM | 1.6.0 | | [GLM4](https://gitee.com/mindspore/mindformers/blob/master/docs/model_cards/glm4.md) | 9B | 稠密LLM | 1.6.0 | -| [Llama3.1](https://gitee.com/mindspore/mindformers/blob/master/research/llama3_1) | 8B/70B | 稠密LLM | 1.6.0 | -| [Mixtral](https://gitee.com/mindspore/mindformers/blob/master/research/mixtral) | 8x7B | 稀疏LLM | 1.6.0 | -| [Qwen2.5](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5) | 0.5B/1.5B/7B/14B/32B/72B | 稠密LLM | 1.6.0 | -| [TeleChat2](https://gitee.com/mindspore/mindformers/blob/master/research/telechat2) | 7B/35B/115B | 稠密LLM | 1.6.0 | +| [Llama3.1](https://gitee.com/mindspore/mindformers/tree/master/research/llama3_1) | 8B/70B | 稠密LLM | 1.6.0 | +| [Mixtral](https://gitee.com/mindspore/mindformers/tree/master/research/mixtral) | 8x7B | 稀疏LLM | 1.6.0 | +| [Qwen2.5](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5) | 0.5B/1.5B/7B/14B/32B/72B | 稠密LLM | 1.6.0 | +| [TeleChat2](https://gitee.com/mindspore/mindformers/tree/master/research/telechat2) | 7B/35B/115B | 稠密LLM | 1.6.0 | | [CodeLlama](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/codellama.md) | 34B | 稠密LLM | 1.5.0 | | [CogVLM2-Image](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_image.md) | 19B | MM | 1.5.0 | | [CogVLM2-Video](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md) | 13B | MM | 1.5.0 | -| [DeepSeek-V2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek2) | 236B | 稀疏LLM | 1.5.0 | -| [DeepSeek-Coder-V1.5](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek1_5) | 7B | 稠密LLM | 1.5.0 | -| [DeepSeek-Coder](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek) | 33B | 稠密LLM | 1.5.0 | -| [GLM3-32K](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/glm32k) | 6B | 稠密LLM | 1.5.0 | +| [DeepSeek-V2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek2) | 236B | 稀疏LLM | 1.5.0 | +| [DeepSeek-Coder-V1.5](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek1_5) | 7B | 稠密LLM | 1.5.0 | +| [DeepSeek-Coder](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek) | 33B | 稠密LLM | 1.5.0 | +| [GLM3-32K](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/glm32k) | 6B | 稠密LLM | 1.5.0 | | 
[GLM3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/glm3.md) | 6B | 稠密LLM | 1.5.0 | -| [InternLM2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/internlm2) | 7B/20B | 稠密LLM | 1.5.0 | +| [InternLM2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/internlm2) | 7B/20B | 稠密LLM | 1.5.0 | | [Llama3.2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama3_2.md) | 3B | 稠密LLM | 1.5.0 | | [Llama3.2-Vision](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/mllama.md) | 11B | MM | 1.5.0 | -| [Llama3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3) | 8B/70B | 稠密LLM | 1.5.0 | +| [Llama3](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/llama3) | 8B/70B | 稠密LLM | 1.5.0 | | [Llama2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md) | 7B/13B/70B | 稠密LLM | 1.5.0 | -| [Qwen2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2) | 0.5B/1.5B/7B/57B/57B-A14B/72B | 稠密/稀疏LLM | 1.5.0 | -| [Qwen1.5](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen1_5) | 7B/14B/72B | 稠密LLM | 1.5.0 | -| [Qwen-VL](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwenvl) | 9.6B | MM | 1.5.0 | -| [TeleChat](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/telechat) | 7B/12B/52B | 稠密LLM | 1.5.0 | +| [Qwen2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2) | 0.5B/1.5B/7B/57B/57B-A14B/72B | 稠密/稀疏LLM | 1.5.0 | +| [Qwen1.5](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen1_5) | 7B/14B/72B | 稠密LLM | 1.5.0 | +| [Qwen-VL](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwenvl) | 9.6B | MM | 1.5.0 | +| [TeleChat](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/telechat) | 7B/12B/52B | 稠密LLM | 1.5.0 | | [Whisper](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/whisper.md) | 1.5B | MM | 1.5.0 | -| [Yi](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/yi) | 6B/34B | 稠密LLM | 1.5.0 | -| [YiZhao](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/yizhao) | 12B | 稠密LLM | 1.5.0 | +| [Yi](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/yi) | 6B/34B | 稠密LLM | 1.5.0 | +| [YiZhao](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/yizhao) | 12B | 稠密LLM | 1.5.0 | | [Baichuan2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/baichuan2/baichuan2.md) | 7B/13B | 稠密LLM | 1.3.2 | | [GLM2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/glm2.md) | 6B | 稠密LLM | 1.3.2 | | [GPT2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/gpt2.md) | 124M/13B | 稠密LLM | 1.3.2 | diff --git a/docs/mindspore/_ext/generate_ops_mint_rst.py b/docs/mindspore/_ext/generate_ops_mint_rst.py index 1df70dc41881286c6da00a21ae0ebdc354b80fab..17b8d65a2020577dcb0a1634284c2eff1af7ceae 100644 --- a/docs/mindspore/_ext/generate_ops_mint_rst.py +++ b/docs/mindspore/_ext/generate_ops_mint_rst.py @@ -42,6 +42,7 @@ def generate_ops_mint_rst(repo_path, ops_path, mint_path, pr_need='all'): two_p = [i[1] for i in one_p] # 从导入处获取接口名,分为直接导入接口名,as别名,import导入多个 for i in two_p: + # as别名 if ' as ' in i: name1 = re.findall('(.*?) as (.*)', i)[0][0] name2 = re.findall('(.*?) 
as (.*)', i)[0][1] @@ -49,8 +50,10 @@ def generate_ops_mint_rst(repo_path, ops_path, mint_path, pr_need='all'): mint_ops_dict[modulename].append([name1, name2]) else: continue + # 直接导入接口名 elif i in reg_all: mint_ops_dict[modulename].append(i) + # import导入多个 else: for j in i.split(','): if j.strip() in reg_all: diff --git a/docs/mindspore/_ext/myautosummary.py b/docs/mindspore/_ext/myautosummary.py index bbb05df75aa3e520011f8f7edb5d800dea214648..bca7ac0714c027c972abaac1eeccf42c1da7600b 100644 --- a/docs/mindspore/_ext/myautosummary.py +++ b/docs/mindspore/_ext/myautosummary.py @@ -3,6 +3,7 @@ import os import re import inspect import importlib +from functools import reduce from typing import List, Tuple from docutils.nodes import Node from sphinx.locale import __ @@ -203,6 +204,7 @@ class MsAutosummary(Autosummary): except: display_name_path = "" if 'mindspore/ops/auto_generate/' in display_name_path: + # 处理注释是代码块对比的ops.primitive接口 env_sum = self.get_refer_platform(display_name) summary = self.extract_ops_summary(self.bridge.result.data[:]) if not summary: @@ -492,7 +494,7 @@ class MsCnAutoSummary(Autosummary): max_item_chars = 50 origin_rst_files = self.env.config.rst_files all_rst_files = self.env.found_docs - generated_files = all_rst_files.difference(origin_rst_files) + generated_en_files = all_rst_files.difference(origin_rst_files) for name in names: display_name = name @@ -503,7 +505,7 @@ class MsCnAutoSummary(Autosummary): dir_name = self.options['toctree'] file_path = os.path.join(doc_path, dir_name, display_name+'.rst') spec_path = os.path.join('api_python', dir_name, display_name) - if os.path.exists(file_path) and spec_path not in generated_files: + if os.path.exists(file_path) and spec_path not in generated_en_files: summary_re_tag = re.compile( rf'\.\. \w+:\w+::\s+{display_name}.*?\n\s+:.*?:\n\n\s+((?:.|\n|)+?)(\n\n|。)') summary_re_wrap = re.compile(rf'\.\. \w+:\w+::\s+{display_name}(?:.|\n|)+?\n\n\s+((?:.|\n|)+?)(\n\n|。)') @@ -743,17 +745,19 @@ class MsCnAutoSummary(Autosummary): return [table_spec, table] def get_api(fullname): - """Get the api module.""" + """ + 获取接口对象。 + + :param fullname: 接口名全称 + :return: 属性对象或None(如果不存在) + """ + main_module = fullname.split('.')[0] + main_import = importlib.import_module(main_module) + try: - module_name, api_name = ".".join(fullname.split('.')[:-1]), fullname.split('.')[-1] - # pylint: disable=unused-variable - module_import = importlib.import_module(module_name) - except ModuleNotFoundError: - module_name, api_name = ".".join(fullname.split('.')[:-2]), ".".join(fullname.split('.')[-2:]) - module_import = importlib.import_module(module_name) - # pylint: disable=eval-used - api = getattr(module_import, api_name, '') - return api + return reduce(getattr, fullname.split('.')[1:], main_import) + except AttributeError: + return None class MsCnPlatformAutoSummary(MsCnAutoSummary): """definition of mscnplatformautosummary.""" diff --git a/docs/mindspore/_ext/overwrite_autodoc.txt b/docs/mindspore/_ext/overwrite_autodoc.txt index d4f1513ab500aa37b55ce218267004a46251da32..acbad745367b0a711c5bfe9d3f3c4c06bf7c9bd4 100644 --- a/docs/mindspore/_ext/overwrite_autodoc.txt +++ b/docs/mindspore/_ext/overwrite_autodoc.txt @@ -1788,8 +1788,11 @@ class ClassDocumenter(DocstringSignatureMixin, ModuleLevelDocumenter): # type: spec_tp = [('nn.HShrink', 'mint.nn.Hardshrink'), ('nn.SoftShrink', 'mint.nn.Softshrink'), ('nn.HSigmoid', 'mint.nn.Hardsigmoid'), ('nn.HSwish', 'mint.nn.Hardswish')] if docstrings and '.mint.' 
in self.fullname: + # mint.xxx 中的 xxx b_name = self.fullname.split('.')[-1] + # mint.xxx usename = self.fullname.replace('mindspore.', '') + # 获取接口源文件路径 try: py_source_rel = get_full_modname(self.modname, b_name).replace('.', '/') + '.py' except: diff --git a/docs/mindspore/source_en/conf.py b/docs/mindspore/source_en/conf.py index 2f8f1dfbe79b1b9bc99be4a6485b64dadfd46151..198c99a2f2543c2c9cb18300a52bbe0e89a3100f 100644 --- a/docs/mindspore/source_en/conf.py +++ b/docs/mindspore/source_en/conf.py @@ -223,32 +223,33 @@ with open(autodoc_source_path, "r", encoding="utf8") as f: exec(code_str, sphinx_autodoc.__dict__) # Repair error decorators defined in mindspore. -try: - decorator_list = [("mindspore/common/_decorator.py", "deprecated", - " def decorate(func):", - " def decorate(func):\n\n import functools\n\n @functools.wraps(func)"), - ("mindspore/nn/optim/optimizer.py", "deprecated", - "def opt_init_args_register(fn):\n \"\"\"Register optimizer init args.\"\"\"\n", - "def opt_init_args_register(fn):\n \"\"\"Register optimizer init args.\"\"\"\n\n import functools\n\n @functools.wraps(fn)"), - ("mindspore/log.py", "deprecated", - " def __call__(self, func):\n", - " def __call__(self, func):\n import functools\n\n @functools.wraps(func)\n"), - ("mindspore/ops/primitive.py", "fix for `shard`", - " @_LogActionOnce(logger=logger, key='Primitive')", " # The decorator has been deleted."), - ("mindspore/dataset/engine/datasets.py","generate api", - " @deprecated(\"1.5\")"," # The decorator has been deleted(id1)."), - ("mindspore/dataset/engine/datasets.py","generate api", - " @check_bucket_batch_by_length"," # The decorator has been deleted(id2)."), - ("mindspore/train/summary/summary_record.py", "summary_record", - " value (Union[Tensor, GraphProto, TrainLineage, EvaluationLineage, DatasetGraph, UserDefinedInfo,\n LossLandscape]): The value to store.\n\n", - " value (Union[Tensor, GraphProto, TrainLineage, EvaluationLineage, DatasetGraph, UserDefinedInfo, LossLandscape]): The value to store.\n\n"), - ("mindspore/nn/cell.py","generate api", - " @jit_forbidden_register"," # generate api by del decorator."), - ("mindspore/profiler/dynamic_profiler.py","generate api", - " @no_exception_func()"," # generate api by del decorator.")] - - base_path = os.path.dirname(os.path.dirname(sphinx.__file__)) - for i in decorator_list: + +decorator_list = [("mindspore/common/_decorator.py", "deprecated", + " def decorate(func):", + " def decorate(func):\n\n import functools\n\n @functools.wraps(func)"), + ("mindspore/nn/optim/optimizer.py", "deprecated", + "def opt_init_args_register(fn):\n \"\"\"Register optimizer init args.\"\"\"\n", + "def opt_init_args_register(fn):\n \"\"\"Register optimizer init args.\"\"\"\n\n import functools\n\n @functools.wraps(fn)"), + ("mindspore/log.py", "deprecated", + " def __call__(self, func):\n", + " def __call__(self, func):\n import functools\n\n @functools.wraps(func)\n"), + ("mindspore/ops/primitive.py", "fix for `shard`", + " @_LogActionOnce(logger=logger, key='Primitive')", " # The decorator has been deleted."), + ("mindspore/dataset/engine/datasets.py","generate api", + " @deprecated(\"1.5\")"," # The decorator has been deleted(id1)."), + ("mindspore/dataset/engine/datasets.py","generate api", + " @check_bucket_batch_by_length"," # The decorator has been deleted(id2)."), + ("mindspore/train/summary/summary_record.py", "summary_record", + " value (Union[Tensor, GraphProto, TrainLineage, EvaluationLineage, DatasetGraph, UserDefinedInfo,\n LossLandscape]): The value to 
store.\n\n", + " value (Union[Tensor, GraphProto, TrainLineage, EvaluationLineage, DatasetGraph, UserDefinedInfo, LossLandscape]): The value to store.\n\n"), + ("mindspore/nn/cell.py","generate api", + " @jit_forbidden_register"," # generate api by del decorator."), + ("mindspore/profiler/dynamic_profiler.py","generate api", + " @no_exception_func()"," # generate api by del decorator.")] + +base_path = os.path.dirname(os.path.dirname(sphinx.__file__)) +for i in decorator_list: + try: with open(os.path.join(base_path, os.path.normpath(i[0])), "r+", encoding="utf8") as f: content = f.read() if i[3] not in content: @@ -256,18 +257,18 @@ try: f.seek(0) f.truncate() f.write(content) -except: - print('mindspore替换安装包内容失败') + except: + print(f'替换{i[0]}下内容失败') # Repair error content defined in mindspore. -try: - decorator_list = [("mindspore/common/dtype.py","del decorator", - "@enum.unique","# generate api by del decorator."), - ("mindspore/common/dtype.py","del class", - "class QuantDtype(enum.Enum):","class QuantDtype():")] +decorator_list = [("mindspore/common/dtype.py","del decorator", + "@enum.unique","# generate api by del decorator."), + ("mindspore/common/dtype.py","del class", + "class QuantDtype(enum.Enum):","class QuantDtype():") + ] - base_path = os.path.dirname(os.path.dirname(sphinx.__file__)) - for i in decorator_list: +for i in decorator_list: + try: with open(os.path.join(base_path, os.path.normpath(i[0])), "r+", encoding="utf8") as f: content = f.read() if i[2] in content: @@ -275,14 +276,13 @@ try: f.seek(0) f.truncate() f.write(content) -except: - print('mindspore删除安装包装饰器内容失败') + except: + print(f'替换{i[0]}下内容失败') # add @functools.wraps try: decorator_list = [("mindspore/common/_tensor_overload.py", ".*?_mint")] - base_path = os.path.dirname(os.path.dirname(sphinx.__file__)) for i in decorator_list: with open(os.path.join(base_path, os.path.normpath(i[0])), "r+", encoding="utf8") as f: content = f.read() @@ -299,6 +299,28 @@ except: sys.path.append(os.path.abspath('../../../resource/search')) import search_code +# 发版本时这里启用 +# re_url = r"(((gitee.com/mindspore/docs/mindspore-lite)|(github.com/mindspore-ai/(mindspore|docs))|" + \ +# r"(mindspore.cn/(docs|tutorials|lite))|(obs.dualstack.cn-north-4.myhuaweicloud)|" + \ +# r"(mindspore-website.obs.cn-north-4.myhuaweicloud))[\w\d/_.-]*?)/(master)" + +# re_url2 = r"(gitee.com/mindspore/mindspore[\w\d/_.-]*?)/(master)" + +# re_url4 = r"(((gitee.com/mindspore/mindformers)|(mindspore.cn/mindformers))[\w\d/_.-]*?)/(dev)" + +# for cur, _, files in os.walk(os.path.join(base_path, 'mindspore')): +# for i in files: +# if i.endswith('.py'): +# with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: +# content = f.read() +# new_content = re.sub(re_url, r'\1/r2.7.0rc1', content) +# new_content = re.sub(re_url2, r'\1/v2.7.0-rc1', new_content) +# new_content = re.sub(re_url4, r'\1/r1.6.0', new_content) +# if new_content != content: +# f.seek(0) +# f.truncate() +# f.write(new_content) + # Copy source files of en python api from mindspore repository. 
copy_path = 'docs/api/api_python_en' repo_path = os.getenv("MS_PATH") @@ -340,7 +362,7 @@ if os.path.exists(dataset_list_path): def ops_interface_name(): src_target_path = os.path.join(src_dir_en, 'mindspore.ops.primitive.rst') - with open(src_target_path,'r+',encoding='utf8') as f: + with open(src_target_path,'r',encoding='utf8') as f: content = f.read() primi_list = re.findall(" (mindspore\.ops\.\w*?)\n", content) @@ -349,7 +371,7 @@ def ops_interface_name(): def mint_interface_name(): mint_p = 'mindspore.mint.rst' src_target_path = os.path.join(src_dir_en, mint_p) - with open(src_target_path,'r+',encoding='utf8') as f: + with open(src_target_path,'r',encoding='utf8') as f: content = f.read() mint_list = re.findall(r" (mindspore\.mint\..*)\n", content+'\n') @@ -420,13 +442,23 @@ for cur, _, files in os.walk(des_sir): for i in files: if os.path.join(cur, i) in no_viewsource_list: continue - if i.endswith('.md'): + if i.endswith('.rst') or i.endswith('.md') or i.endswith('.ipynb'): with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: content = f.read() new_content = content - md_view = f'[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{docs_branch}/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/{copy_repo}/blob/{branch}/' + copy_path + cur.split('api_python')[-1] + '/' + i + ')\n\n' - if 'resource/_static/logo_source' not in new_content: - new_content = re.sub('(# .*\n\n)', r'\1'+ md_view, new_content, 1) + + # 发版本时这里启用 + # new_content = re.sub(re_url, r'\1/r2.7.0rc1', new_content) + # new_content = re.sub(re_url4, r'\1/r1.6.0', new_content) + # if i.endswith('.rst'): + # new_content = re.sub(re_url2, r'\1/v2.7.0-rc1', new_content) + + # master使用 + if i.endswith('.md'): + md_view = f'[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{docs_branch}/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/{copy_repo}/blob/{branch}/' + copy_path + cur.split('api_python')[-1] + '/' + i + ')\n\n' + if 'resource/_static/logo_source' not in new_content: + new_content = re.sub('(# .*\n\n)', r'\1'+ md_view, new_content, 1) + if new_content != content: f.seek(0) f.truncate() @@ -614,5 +646,8 @@ else: content = content[0] with open(des_release, "w", encoding="utf-8") as p: + # 发版本时这里启用 + # content = re.sub(re_url, r'\1/r2.7.0rc1', content) + # content = re.sub(re_url2, r'\1/v2.7.0-rc1', content) p.write("# Release Notes" + "\n\n" + release_source) p.write(content) diff --git a/docs/mindspore/source_en/faq/network_compilation.md b/docs/mindspore/source_en/faq/network_compilation.md index ea6b8e474722936f52ca12a42a390bf26f479a36..2c27b842d5b661748a3330868a9f396d234cd7eb 100644 --- a/docs/mindspore/source_en/faq/network_compilation.md +++ b/docs/mindspore/source_en/faq/network_compilation.md @@ -413,8 +413,7 @@ A: The "External" type indicates that an object that cannot be natively supporte ## Q: What can I do if an error `Nested execution during JIT execution for 'xxx' is not supported when 'xxx' compile and execute.` is reported? -A: When the compilation process is triggered, that is, when the code is compiled into a static computational diagram -, see [Graph Mode Execution Principle](https://www.mindspore.cn/docs/en/master/features/program_form/overview.html), using the JIT Fallback feature by default, the above exception will be thrown when entering the compilation process again. 
+A: When the compilation process is triggered, that is, when the code is being compiled into a static computation graph (using the JIT Fallback feature by default), the above exception is thrown if the compilation process is entered again. Taking JIT Fallback support for calling objects and methods from third-party libraries as an example: diff --git a/docs/mindspore/source_en/features/compile/compilation_guide.md b/docs/mindspore/source_en/features/compile/compilation_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..e77411ba7f99969b5639870715a72e63e396cd14 --- /dev/null +++ b/docs/mindspore/source_en/features/compile/compilation_guide.md @@ -0,0 +1,459 @@ +# mindspore.jit Multi-Level Compilation Optimization + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_en/features/compile/compilation_guide.md) + +## MindSpore Compilation Architecture + +MindSpore uses JIT (just-in-time) compilation for performance optimization. In JIT mode, Python code is converted into an intermediate representation (IR) graph, which we call MindIR, through AST parsing, Python bytecode parsing, or code execution tracing. The compiler optimizes this IR graph to improve runtime performance. In contrast to dynamic graph mode, this JIT compilation mode is called graph mode. + +Python code written by developers runs in dynamic graph mode by default. Functions can be decorated with the @mindspore.jit decorator to specify execution in graph mode. For documentation on the @mindspore.jit decorator, please refer to the [jit documentation](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.jit.html). + +Graph mode is roughly divided into three stages: + - Graph Capture (Graph Construction): Python code -> MindIR. + - Graph Optimization (Frontend): Hardware-independent optimization of MindIR, such as algebraic simplification, function inlining, and redundancy elimination. + - Graph Optimization (Backend): Hardware-dependent optimization of MindIR, such as LazyInline, operator selection, and graph-operator fusion. + +## Graph Capture (Graph Construction) + +MindSpore provides three capture methods as follows: + - AST: Converts the decorated functions into IR graphs through AST parsing + - bytecode (experimental): Parses Python bytecode to construct IR graphs as far as possible; parts that cannot be converted to IR graphs are executed as a dynamic graph + - trace (experimental): Constructs IR graphs by tracing the execution trajectory of Python code + + + Taking ast as an example: developers can decorate a function with the `@mindspore.jit(capture_mode="ast")` decorator. Functions decorated in ast mode are subject to certain syntax restrictions. We provide two modes for developers to choose from (a usage sketch follows the list below). + + - strict mode: The goal of this mode is to construct a single graph. If the developer's Python code cannot be converted into a graph, choosing this mode will cause an error when running the program, requiring the developer to rewrite the code using graph-compatible syntax. This is suitable for developers pursuing performance. + - lax mode: The goal of this mode is to make the developer's program runnable as far as possible. The idea is to fall back to Python for code that cannot be captured in strict mode, that is, to return to the Python layer for execution.
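+
+A minimal usage sketch of the ast mode, adapted from the former graph construction guide (the decorated function is compiled into a static graph on its first call, and the cached result is reused on later calls):
+
+```python
+import numpy as np
+import mindspore
+
+# Compile tensor_cal into a static graph via AST parsing.
+@mindspore.jit(capture_mode="ast")
+def tensor_cal(x, y, z):
+    return mindspore.ops.matmul(x, y) + z
+
+x = mindspore.tensor(np.ones(shape=[2, 3]), mindspore.float32)
+y = mindspore.tensor(np.ones(shape=[3, 4]), mindspore.float32)
+z = mindspore.tensor(np.ones(shape=[2, 4]), mindspore.float32)
+print(tensor_cal(x, y, z))  # a 2x4 tensor filled with 4.0
+```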
+ +For graph mode constraints, please refer to [Syntax Constraints](https://www.mindspore.cn/tutorials/en/master/compile/static_graph.html). Here is an example of how ast parses Python code and constructs a graph: + +```python +@mindspore.jit +def foo(x, y): + z = x + y + return z +``` + +The corresponding abstract syntax tree is as follows: + +![Abstract Syntax Tree](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_zh_cn/features/compile/images/ast.png) + +By parsing the above abstract syntax tree, we obtain the following IR: + +```text +%para1_x: +%para2_y: + +subgraph instance: foo +subgraph @foo() { + %0(CNode_17) = PrimFunc_Add(%para1_x, %para2_y) + : (, ) -> () + Return(%0) + : () +} +``` + +**Advantages of ast**: + + - Using ast mode gives users stronger programming autonomy and more precise performance optimization; they can tune network performance to its optimum based on function characteristics and usage experience. + +**Limitations of ast**: + + - Functions decorated in ast mode must be written strictly in static graph syntax. + +**Recommendations for ast mode**: + - Compared to dynamic graph execution, a function decorated with `@mindspore.jit` incurs some compilation time on its first call. On subsequent calls, the cached compilation result is reused whenever possible. Therefore, decorating functions that are executed many times with `@mindspore.jit` usually yields greater performance benefits. + + - The runtime efficiency advantage of graph mode comes from the global compilation optimization of functions decorated with `@mindspore.jit`: the more operations a function contains, the more room there is for optimization. Therefore, functions decorated with `@mindspore.jit` should ideally be large code blocks containing many operations, rather than many fragmented functions that each contain only a few operations and carry their own jit tags. Otherwise, there may be no performance benefit, or even a degradation. + + - Most of the compilation optimizations target Tensor computation. It is recommended to decorate functions that perform real data computation, rather than simple scalar calculations or data structure transformations. + + - For functions decorated with `@mindspore.jit`, if their inputs contain constants, every change of the input value triggers recompilation. For the concepts of constants and variables, please refer to [Constants and Variables in Just-in-Time Compilation](https://www.mindspore.cn/tutorials/en/master/compile/static_graph.html). Therefore, it is recommended that decorated functions take Tensors or data wrapped by Mutable as inputs to avoid the extra performance cost of repeated compilation. + +## Graph Optimization (Frontend) + +Similar to traditional compilation optimization techniques, compilation optimization in MindSpore is completed through individual Passes. Each Pass takes the MindIR produced by the previous Pass as input and, after its own optimization, produces a new MindIR representation as output. A large Pass can contain multiple small Passes, each of which is responsible for a single-point compilation optimization, such as algebraic simplification, function inlining, or redundancy elimination.
The optimization result produced by one Pass may bring optimization opportunities for other Passes, so these Passes can be run in cycles until the produced MindIR no longer changes. + +There are many frontend compilation optimization techniques, such as: algebraic simplification, function inlining, redundancy elimination, etc. Here we only introduce representative compilation optimization techniques. + +### 1 Algebraic Simplification + +In traditional compilers, algebraic simplification is a compiler optimization technique aimed at simplifying algebraic expressions in source code, eliminating redundant calculations, improving program execution efficiency, and reducing memory usage. +For example, in the following code snippet: + +```cpp +int a = x * 1; +int b = x + 0; +int c = x * 0 + y * 1; +``` + +Traditional compilers perform equivalent replacement of identified expressions according to algebraic rules and identities. Common algebraic rules include associative law, commutative law, and distributive law, etc. The compiler tries to replace expressions with simpler forms as much as possible. Optimization is performed through analysis of AST (Abstract Syntax Tree) or SSA (Static Single Assignment), identifying and simplifying code to: + +```cpp +a = x; +b = x; +c = y; +``` + +In the MindSpore compiler, the principle of algebraic simplification is different from traditional compilers. It processes computation graphs rather than traditional control flow graphs, by adjusting the execution order of operators in the computation graph, or deleting unnecessary operators, to maintain the simplicity of the computation graph and improve computational efficiency. + +For example, in the following Python code snippet: + +```python +import numpy as np +import mindspore + +@mindspore.jit +def func(x): + return x + 0 + +m = mindspore.tensor(np.array([[1, 2, 3], [4, 5, 6]]).astype(np.int32)) +out = func(m) +``` + +The MindSpore graph compiler will convert the Python program to a computation graph, which consists of multiple subgraphs. Algebraic operations in the source program are converted to operator calls within subgraphs. You can see that the PrimFunc_Add operator is called once. + + +```text +%para1_x: + +subgraph @1_func_14() { + %0(CNode_7) = PrimFunc_Add(%para1_x, Tensor(shape=[], dtype=Int32, value=0)) + : (, ) -> () + + Return(%0) + : () +} +``` + +Through algebraic simplification, the PrimFunc_Add operator can be directly deleted, simplifying the computation graph structure, and simplifying x + 0 to x. + + +```text +%para1_x: + +subgraph @1_func_14() { + Return(%para1_x) + : () +} +``` + +Algebraic simplification can involve more modifications to the computation graph structure. It is usually combined with other compiler optimization techniques (such as constant folding, constant propagation, etc.) to jointly improve program performance. + +### 2 Function Inlining + +In traditional compilers, inlining is an optimization technique that can directly replace the code of called functions at the location where the function is called, improving program execution efficiency. 
Suppose we have a C++ function add for summing two numbers: + +```cpp +int add(int a, int b) { + return a + b; +} + +int main() { + int x = add(3, 5); + int y = add(x, 10); + return y; +} +``` + + +The compiler inlines the function body directly at the call site, which eliminates the overhead of function calls and creates opportunities for subsequent optimizations (such as eliminating the redundant calculation 3 + 5 by evaluating it at compile time). This idea of replacing calls with code is the core of inlining. + +```cpp +int main() { + int x = 3 + 5; // Replace first call + int y = x + 10; // Replace second call + return y; +} +``` + +In AI framework computation graph compilers, the goal of inlining is similar, but the object being operated on changes from "functions" to "subgraphs". Suppose we have a Python program: + +```python +import mindspore + +def f2(x: mindspore.Tensor, y: mindspore.Tensor): + return x * 0.5 + y + +@mindspore.jit +def f1(a: mindspore.Tensor, b: mindspore.Tensor, c: mindspore.Tensor): + x = f2(a, b) + y = f2(a, c) + return x + y + +# Create 3 random value Tensors with shape=(2, 4) +a = mindspore.ops.randn(2, 4) +b = mindspore.ops.randn(2, 4) +c = mindspore.ops.randn(2, 4) +out = f1(a, b, c) +``` + + +First, MindSpore's computation graph compiler will convert the Python program to a computation graph. Function calls in the Python program will be converted to calls between computation graphs, resulting in an original computation graph similar to the following, in which the main graph f1 calls the subgraph f2 twice. + + +```text +# Params: +%para1_a: +%para2_b: +%para3_c: + +subgraph @f2(%para1_x, %para2_y) { + %0 = PrimFunc_Mul(%para1_x, Float32(0.5)) + + %1 = PrimFunc_Add(%0, %para2_y) + + Return(%1) +} + +subgraph @f1() { + %0(x) = call @f2(%para1_a, %para2_b) # Call subgraph f2 + + %1(y) = call @f2(%para1_a, %para3_c) # Call subgraph f2 + + %2 = PrimFunc_Add(%0, %1) + + Return(%2) +} +``` + +Through inlining, the subgraph f2 can be expanded and merged into the main graph f1. + +```text +subgraph @f1() { + # First subgraph inlining + %0 = PrimFunc_Mul(%para1_a, Float32(0.5)) # Repeated calculation step + %1 = PrimFunc_Add(%0, %para2_b) + + # Second subgraph inlining + %2 = PrimFunc_Mul(%para1_a, Float32(0.5)) # Repeated calculation step + %3 = PrimFunc_Add(%2, %para3_c) + + %4 = PrimFunc_Add(%1, %3) + + Return(%4) +} +``` + +Before inlining expands the subgraph, the compiler may not be able to identify the repeated operations in the two calls to subgraph f2 (at this point the subgraph is usually treated as a black box). After inlining expands the subgraph, the compiler can clearly see that x * 0.5 is calculated twice, which can trigger further optimization by the compiler: Common Subexpression Elimination (CSE), thus reducing the amount of calculation. + +```text +subgraph @f1() { + %0 = PrimFunc_Mul(%para1_a, Float32(0.5)) # CSE merges repeated calculations + + %1 = PrimFunc_Add(%0, %para2_b) + + %2 = PrimFunc_Add(%0, %para3_c) # Directly reuse %0 + + %3 = PrimFunc_Add(%1, %2) + + Return(%3) +} +``` + +By expanding subgraphs through inlining, the compiler can more clearly identify cross-subgraph optimization opportunities. In addition to Common Subexpression Elimination (CSE), it can also trigger many other optimizations such as operator fusion and memory management. Therefore, inlining is an important optimization mechanism in computation graph compilers and the foundation for many cross-graph optimizations.
+ +### 3 Redundancy Elimination + +In traditional compilers, redundancy elimination includes various compilation optimization techniques aimed at identifying redundant parts in code during compilation and eliminating them to reduce unnecessary calculations and improve program execution efficiency. + +Usually redundant code may be intentionally written by users for readability purposes, or it may just be an unintentional act during the coding process. In addition, intermediate results produced by the compilation optimization process itself through other optimization techniques (such as: algebraic simplification, inlining, common subexpression elimination, etc.) may also bring opportunities for redundancy elimination. + +The purpose and techniques used in MindSpore redundancy elimination are similar to traditional compilers. The difference is that these redundancy optimizations are completed on MindIR. For example: + +1. **Dead Code Elimination** + + Suppose there is Python code with redundant calculations as follows: + + ```python + import mindspore + + @mindspore.jit + def func(x, y): + a = x + y + b = x - y + c = x * y # Dead code + d = a / b + return d + + x = mindspore.tensor(20, mindspore.float32) + y = mindspore.tensor(10, mindspore.float32) + out = func(x, y) + ``` + +The MindSpore graph compiler will convert Python code decorated with `@mindspore.jit` to MindIR representation through static analysis and eliminate the redundant calculation of c = x * y. The final generated MindIR is as follows: + + ```text + # Params: + %para1_x: + %para2_y: + + subgraph @func_1() { + %0(a) = PrimFunc_Add(%para1_x, %para2_y) + : (, ) -> () + %1(b) = PrimFunc_Sub(%para1_x, %para2_y) + : (, ) -> () + %2(d) = PrimFunc_Div(%0, %1) + : (, ) -> () + Return(%2) + : () + } + ``` +2. **Unreachable Code Elimination** + + Suppose there is Python code with unreachable paths as follows: + + ```python + import mindspore + + @mindspore.jit + def func(x, y): + a = x + y + if 1 < 0: # Unreachable branch + b = x + y + else: + b = x - y + d = a / b + return d + + x = mindspore.tensor(20, mindspore.float32) + y = mindspore.tensor(10, mindspore.float32) + out = func(x, y) + ``` + +The MindSpore graph compiler will convert Python code decorated with `@mindspore.jit` to MindIR representation through static analysis and eliminate the redundant control flow branch code of `1 < 0`. The final generated MindIR is as follows: + + ```text + # Params: + %para1_x: + %para2_y: + + subgraph @func_1() { + %0(a) = PrimFunc_Add(%para1_x, %para2_y) + : (, ) -> () + %1(b) = PrimFunc_Sub(%para1_x, %para2_y) + : (, ) -> () + %2(d) = PrimFunc_Div(%0, %1) + : (, ) -> () + Return(%2) cnode_attrs: {checkpoint: Bool(1)} + : () + } + ``` + +Redundancy elimination plays an important role in compilation optimization. Without changing the original semantics of the program, it can significantly improve program execution efficiency and save computational resources by reducing unnecessary runtime calculations. Redundancy elimination is usually combined with other compilation optimization techniques to obtain more opportunities for eliminating redundant code. + +## Graph Optimization (Backend) +After the MindIR graph completes frontend optimization, it needs further optimization (including target hardware). The optimization modes are divided into O0 and O1, represented by the parameter jit_level: + - **jit_level=O0**: Only performs basic graph segmentation optimization and operator selection (hardware-related). 
The advantage is that it preserves the original structure of the IR graph and compiles faster. + - **jit_level=O1**: Adds graph optimization and automatic operator fusion. Compilation takes somewhat longer, but execution is more efficient once the model starts training. + +After this round of optimization, MindIR will be executed by the runtime module, involving multi-level pipeline concurrency and other technologies. For reference, see [Multi-Level Pipeline]. + +### jit_level=O0 Mode + +O0 mode applies only a few optimizations, mainly backend LazyInline and No-task node execution optimization. + + - **LazyInline**: The main idea is to postpone the overhead of function calls until they are actually needed, which can reduce compilation overhead and improve compilation efficiency. LazyInline reuses the same subgraph structure during the graph compilation phase without expanding it in the graph, avoiding a large graph scale that would hurt compilation performance. + + ![jit_level_lazyinline](./images/multi_level_compilation/jit_level_lazyinline.png) + + - **No-task node Execution Optimization**: No-task nodes refer to operators such as Reshape, ExpandDims, Squeeze, Flatten, FlattenGrad, Reformat, etc. These operators have no computational logic, do not modify memory layout, and only modify shape, format and other information. At the end of graph compilation, No-task nodes are converted to ref nodes, where the output shares the same address as the input, and kernel launch is skipped during execution to improve execution performance. + + ![jit_level_no_task](./images/multi_level_compilation/jit_level_no_task.png) + +#### Operator Selection + +Operators are the basic execution units in deep learning frameworks. They are responsible for performing specific computational tasks such as matrix multiplication, convolution, and pooling. Operator selection requires comprehensive consideration of factors such as operator type, data type, hardware platform, and operator optimization to select the optimal operator and achieve the highest model runtime efficiency. + +MindSpore's operator types on Ascend hardware are aclnn kernel/aclop kernel/hccl kernel/cpu kernel. The operator selection process is shown in the following figure: + +![jit_level_kernelselect](./images/multi_level_compilation/jit_level_kernelselect.png) + +1. Operator type: First, determine from the operator type whether it is a computational operator or a communication operator. +2. Hardware platform: If there is a corresponding operator on the hardware, the hardware operator is preferred; otherwise the CPU operator is chosen (heterogeneous execution). For example, shape-related computational operators may only be suitable for the CPU and have no corresponding hardware operator. +3. Operator efficiency: Because aclnn operators perform better on Ascend hardware, computational operators prefer the aclnn kernel if a corresponding one exists; otherwise the aclop kernel is chosen. +4. If no operator is selected in any of the above three steps, it is an unsupported operator and operator selection fails with an error.
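+
+Both the O0 behavior described in this section and the O1 behavior described later are selected when a function is decorated. A minimal sketch, assuming the `jit_level` keyword of `mindspore.jit` described in its API documentation:
+
+```python
+import numpy as np
+import mindspore
+
+# jit_level="O0" keeps the basic pipeline described in this section;
+# jit_level="O1" additionally enables graph optimization and automatic
+# operator fusion (see the O1 section below).
+@mindspore.jit(jit_level="O0")
+def scale_add(x, y):
+    return x * 0.5 + y
+
+x = mindspore.tensor(np.ones((2, 4)), mindspore.float32)
+y = mindspore.tensor(np.ones((2, 4)), mindspore.float32)
+out = scale_add(x, y)
+```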
+ +#### Execution Order Scheduling + +Different graph traversal algorithms produce execution orders with large differences in execution performance and memory, as shown in the figure: + +![jit_level_exec_order](./images/multi_level_compilation/jit_level_exec_order.png) + + - **Execution order obtained by BFS**: kernel1-> kernel2-> kernel4-> kernel5-> kernel3-> kernel6, memory peaks at 5G (kernel3 can release kernel1 and kernel2 after execution, then reuse them when it's kernel6's turn to execute, so kernel6 doesn't need to request extra memory). + - **Execution order obtained by DFS**: kernel1-> kernel2-> kernel3-> kernel4-> kernel5-> kernel6, memory peaks at 4G (kernel3 can release kernel1 and kernel2 after execution, then reuse them when it's kernel4 and kernel5's turn to execute, so kernel4 and kernel5 don't need to request extra memory). + +Execution order scheduling is a complex problem of solving optimal operator concurrency under certain memory constraints. It not only requires identifying and exploiting concurrency opportunities in the computational graph to improve computational efficiency, but also must consider multiple constraints simultaneously to ensure system stability and efficiency. + + - First, the optimization module needs to address the complexity of solving for optimal operator concurrency. Due to the large number of operators in the computational graph and their interdependencies, finding an execution order that maximizes concurrency while maintaining the logical correctness of the computational graph is a challenging task. + + - Second, memory constraints are a critical factor that cannot be ignored in execution order optimization. Increasing concurrency, while improving computational efficiency, tends to significantly increase peak memory requirements, which may lead to Out of Memory (OOM) errors, especially in resource-constrained environments. Therefore, the optimization module must weigh the relationship between concurrency and memory usage to ensure that concurrency is increased without exceeding the memory capacity of the system. + + - MindSpore's execution order adjustment module combines rule-based and heuristic-based strategies to provide both bfs/dfs execution order orchestration algorithms [mindspore.jit(option={"exec_order":"bfs/dfs"})](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.jit.html) to achieve fine-grained adjustment of the execution order of the computation graph, thus effectively dealing with multiple challenges such as memory constraints and system stability while ensuring computational efficiency. + +### jit_level=O1 Mode + +Currently O1 mainly supports graph-operator fusion optimization. The main idea is: during the compilation phase, automatically identify neighboring fusable nodes in the computational graph, then fuse them into executable operators with larger granularity. Through graph-operator fusion, optimization effects such as increasing operator computational locality and reducing overall global memory access bandwidth overhead are achieved. Through real-world testing verification on mainstream SOTA models, O1 can achieve an average 15% performance acceleration compared to O0. Especially for memory access-intensive networks, the optimization effect of O1 is more significant. + +#### Graph-Kernel Fusion + +Mainstream AI computing frameworks such as MindSpore provide operators to users that are usually defined from the perspective of user understanding and ease of use. 
Each operator carries different amounts of computation and varies in computational complexity. However, from the hardware execution perspective, this natural, user perspective-based division of operator computation volume is not efficient and cannot fully utilize the computational power of hardware resources. This is mainly reflected in: + +1. Operators with too much computation and overly complex operators usually make it difficult to generate well-split high-performance operators, thereby reducing device utilization; + +2. Operators with too little computation may also cause computational latency and thus reduce device utilization, as the computation cannot effectively hide data movement overhead; + +3. Hardware devices are usually multi-core, many-core architectures. When operator shapes are small or other reasons cause insufficient computational parallelism, it may cause some cores to be idle, thus reducing device utilization. Especially chips based on Domain Specific Architecture (DSA for short) are more sensitive to these factors. How to maximize hardware computational performance while making operators easy to use has always been a big challenge. + +In terms of AI framework design, the current industry mainstream adopts a layered implementation approach of graph layer and operator layer. The graph layer is responsible for fusing or regrouping the computational graph, and the operator layer is responsible for compiling the fused or regrouped operators into high-performance executable operators. The graph layer usually uses Tensor-based High-Level IR for processing and optimization, while the operator layer uses computation instruction-based Low-Level IR for analysis and optimization. This artificial layered processing significantly increases the difficulty of collaborative optimization between the graph and computation layers. + +MindSpore has adopted the technique of graph-operator fusion to better solve this problem in the past few years of technical practice. Typical networks in different categories such as NLP and recommendation show significant gains in training speed after enabling graph-operator fusion. One of the main reasons is the presence of a large number of small operator combinations in these networks, which have more opportunities for fusion optimization. + +#### Graph-Kernel Fusion Architecture and Overall Process + +The overall architecture of graph-operator fusion is shown in the figure below. The main idea in the graph layer is to expand composite operators, then perform cross-boundary aggregation and optimization, and finally perform kernel operator splitting. The main steps include: + +1. Composite Expansion: Expand composite operators into basic operators and form composite subgraphs to facilitate subsequent cross-boundary optimization and operator splitting; + +2. Cross-OP Aggregation: Aggregate adjacent basic operators or composite subgraphs to form larger aggregated subgraphs for subsequent cross-boundary optimization and operator splitting; + +3. High-Level Optimization: Based on the aggregated subgraphs obtained in the above two steps, we can perform a large number of cross-boundary optimizations, such as algebraic simplification, common subexpression extraction (CSE), etc.; + +4. Kernel Partition: Based on computational features and fusion operator performance, perform operator splitting on the aggregated computational subgraph. 
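+
+Before the hand-off to MindSpore AKG described next, here is a hedged sketch of the kind of user code this pipeline targets: a chain of small elementwise operators compiled under O1 (again assuming the `jit_level` keyword shown earlier) gives the aggregation and splitting steps above adjacent nodes to fuse; the fused kernels are generated internally, so no user-code changes are needed beyond selecting O1:
+
+```python
+import numpy as np
+import mindspore
+
+# A memory-access-bound chain of small elementwise operators; under O1 the
+# compiler may aggregate and fuse adjacent nodes into larger kernels.
+@mindspore.jit(jit_level="O1")
+def fused_activation(x, y):
+    z = x * 0.5 + y
+    e = mindspore.ops.exp(z)
+    return e / (1.0 + e)
+
+x = mindspore.tensor(np.random.randn(2, 4).astype(np.float32))
+y = mindspore.tensor(np.random.randn(2, 4).astype(np.float32))
+out = fused_activation(x, y)
+```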
+ +The optimized computational graph is passed to MindSpore AKG as subgraphs for further backend optimization and target code generation. + +![graphkernel](./images/graphkernel.png) + +Through the above steps, we can obtain two aspects of performance gains: +1. Cross-boundary performance optimization gains between different operators; +2. Through reorganization and splitting of the entire computational graph, the optimal granularity of fusion operators is obtained. + +#### Fusion Operator Acceleration Optimization (MindSpore AKG) + +As mentioned earlier, in scenarios such as HPC and deep neural network training, graph-operator fusion optimization can bring exponential performance improvements. However, with the increasing capability of graph-operator fusion, the development of fusion operators has become a bottleneck point for continuing to improve graph-operator fusion capability. + +Automatic generation technology of fusion operators can solve the problem of high programming threshold for developing fusion operators based on DSA, allowing programmers to focus on operator implementation logic during operator development without focusing on backend optimization, greatly improving their development efficiency. Especially for scenarios with complex backend hardware architectures and the presence of complex operators and fusion operators, automatic operator generation technology is more critical. + +Therefore, **MindSpore AKG accelerates optimization and automatic generation of fusion operators based on Polyhedral Compilation Technology (Polyhedral Model)**, which can help fusion operators optimized by MindSpore's graph-operator fusion module to automatically generate high-performance kernels on **heterogeneous hardware platforms**(GPU/Ascend) and improve MindSpore training performance. + +The architecture and overall process are as follows: + +![graphkernel_akg_overview](./images/graphkernel_akg_overview.png) + +The overall framework of MindSpore AKG is shown in the figure above: + - IR Normalization + - The input of MindSpore AKG is the fusion subgraph optimized by MindSpore's graph-operator fusion module. The operators in the subgraph are expressed through various description methods such as TVM's Compute/IR Builder/Hybrid. Then the DSL is converted to [Halide](https://halide-lang.org/) IR (Halide, a common language used for developing high-performance image processing and array computation, which can be used as an intermediate representation to decouple algorithms and optimization) and IR normalization; + + - After initial simplification and optimization is completed, the Halide IR is transformed into the scheduling tree required by the Poly module; + + - Poly Module Scheduling Optimization + - Using the Pluto scheduling algorithm in polyhedral technology to achieve automatic loop fusion, automatic rearrangement and other transformations, automatically generating initial scheduling that satisfies parallelism and data locality for fusion operators; + + - To quickly adapt to different hardware backends, the optimization passes in the Poly module are divided into hardware-independent generic optimizations and hardware-related specific optimizations, which are stitched and combined according to hardware features at compilation time to achieve fast adaptation of heterogeneous hardware backends. 
Auto-slicing, auto-mapping, and automatic memory promotion passes apply different optimizations according to the characteristics of the target hardware architecture; + + - Backend Optimization + - To further improve operator performance, we developed dedicated optimization passes for each hardware backend, such as data alignment and instruction mapping for the Ascend backend, and vectorized memory access and insertion of synchronization instructions for the GPU backend, before finally generating the corresponding platform code. + +Summary: MindSpore compilation optimizes AI model code along several dimensions, such as graph capture mode, IR optimization, and graph-kernel fusion. Many of these features still face trade-offs between usability and performance. We also plan to further layer and decouple the entire process, avoiding black-box behavior and lowering the barrier for developers to understand it. \ No newline at end of file diff --git a/docs/mindspore/source_en/features/compile/graph_construction.md b/docs/mindspore/source_en/features/compile/graph_construction.md deleted file mode 100644 index bc30d9fe528badd17403bce9d9bae5e2ba1bc0da..0000000000000000000000000000000000000000 --- a/docs/mindspore/source_en/features/compile/graph_construction.md +++ /dev/null @@ -1,181 +0,0 @@ -# Graph Construction (Compilation) - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_en/features/compile/graph_construction.md) - -MindSpore provides JIT (just-in-time) technology to optimize the performance. The JIT mode parses the code into an intermediate representation (IR) graph by means of AST tree parsing, Python bytecode parsing or code execution tracing, which serves as a unique representation of the code, and the compiler optimizes the code by optimizing the IR graph to improve the runtime performance. In contrast to the dynamic graph model, this JIT compilation model is called the static graph model. - -Based on JIT technology, MindSpore provides a dynamic-static combination approach to improve the operational efficiency of the user's network. The combination of dynamic and static, that is, in the overall run as a dynamic graph, specifies certain code blocks to run as a static graph. Code blocks that run as static graphs are compiled first and then executed, and global optimizations are performed during the compilation period to obtain performance gains during the execution period. Users can modify functions with the `@jit` decorator to specify that they execute according to the pattern of a static graph. For the documentation on the `@jit` decorator, refer to [jit API documentation](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.jit.html#mindspore.jit). - -MindSpore provides three JIT compilation methods, namely, ast, bytecode and trace. The ast converts the functions that are identified by the users manually and need to be executed in accordance with the ast into a static graph through the AST tree parsing. The bytecode is through the Python bytecode parsing, in the dynamic graph as much as possible to build a static graph. The part that can not be converted to a static graph will be in accordance with the dynamic graph for the purpose of combining static and dynamic. The trace constructs a static graph by tracing the execution path of Python code and is currently an experimental feature.
Subsequent introduction will explain in detail the difference among the three principles and their respective characteristics. - -## Ast - -In dynamic graph mode, the user can modify a function to execute in ast mode by using the `@jit(capture_mode=“ast”)` decorator. The syntax and data structures used inside the functions which decorated by ast mode need to strictly follow the [Static Graph Syntax Specification](https://www.mindspore.cn/tutorials/en/master/compile/static_graph.html). The ast approach compiles Python code via a source-to-source method, which first parses the Python source code of model definitions into an Abstract Syntax Tree (AST), then converts the AST into MindIR. For example, the following Python code: - -```python -@jit -def foo(x, y): - z = x + y - return z -``` - -The corresponding AST is as follows: - -![image](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_zh_cn/features/compile/images/ast.png) - -By parsing the above AST, we obtain the following MindIR: - -```text -%para1_x: -%para2_y: - -subgraph instance: foo -subgraph @foo() { - %0(CNode_17) = PrimFunc_Add(%para1_x, %para2_y) - : (, ) -> () - Return(%0) - : () -} -``` - -**ast Usage** - -The user can specify that the function is to be executed as a static graph via the `@jit` decorator, for example: - -```python -import numpy as np -import mindspore as ms -from mindspore import ops -from mindspore import jit -from mindspore import Tensor - -@jit -def tensor_cal(x, y, z): - return ops.matmul(x, y) + z - -x = Tensor(np.ones(shape=[2, 3]), ms.float32) -y = Tensor(np.ones(shape=[3, 4]), ms.float32) -z = Tensor(np.ones(shape=[2, 4]), ms.float32) -ret = tensor_cal(x, y, z) -print(ret) -``` - -```text -[[4. 4. 4. 4.] - [4. 4. 4. 4.]] -``` - -In the above use case, the tensor_cal function is modified by the @jit decorator, and the function follows the pattern of the static graph when it is called in order to capture the performance gains during the execution period of the function. - -**Advantages** - -- With the ast model, users have more programming autonomy and more precise performance optimization, allowing them to tune the performance of the network to the optimal level based on function characteristics and usage experience. - -**Limitations** - -- Functions modified by ast must be programmed with an internal syntax that strictly adheres to the static graph. - -**Recommendations for the Use of the ast Model** - -- In contrast to dynamic graph execution, a function modified by `@jit` consumes some time to compile a static graph the first time it is called. On subsequent calls to the function, if the original compilation result can be reused, the original compilation result will be used for execution. As a result, functions that are executed multiple times using @jit decorator usually gain more performance benefits. - -- The operational efficiency advantage of the static graph pattern is that it optimizes the compilation of @jit-modified functions globally. The more operations a function contains, the higher the upper limit of optimization. Therefore, functions modified by the `@jit` decorator should ideally be large chunks of code with a lot of operations, rather than many small, fragmented functions with only a few operations tagged with a separate jit tag. Otherwise, there may be no performance gain or even degradation. 
- -- The vast majority of calculations and optimizations for MindSpore static graphs are based on optimizations for Tensor calculations, so we recommend that the functions that are modified should be the kind of functions that are used to perform real data calculations, rather than simple scalar calculations or transformations of data structures. - -- Functions modified by `@jit` that have constants in their inputs will result in a recompile each time that the function input value changes. See [Constants and Variables Within JIT](https://www.mindspore.cn/tutorials/en/master/compile/static_graph.html#constants-and-variables-within-jit) for the concept of variable constants. Therefore, it is recommended that the modified function takes as input Tensor or data modified by Mutable. Avoid additional performance loss due to multiple compilations. - -## Bytecode - -In addition to ast, MindSpore provides another static acceleration mechanism, bytecode, which allows the user to modify a function to execute in bytecode mode via the `@jit(capture_mode=“bytecode”)` decorator. When bytecode recognizes that the syntax for entering a static graph is not supported, it will fall back to Python for execution instead of compiling directly and reporting errors. This feature combines performance and ease of use to reduce the occurrence of compilation errors. It is based on the analysis of Python bytecode, graph capture of Python execution flow, allowing subgraphs that can be run as static graphs to be run as static graphs, and allowing subgraphs that are not supported by Python syntax to be run as dynamic graphs, as well as linking the dynamic-static graphs by modifying and adjusting the bytecode, so as to achieve a mixed execution of dynamic and static. While meeting the premise of ease of use, to improve performance as much as possible. - -**bytecode Operating Principle** - -1. Capture the execution of Python functions based on Python VM_PyInterpreterState_SetEvalFrameFunc, which captures the execution of all Python functions in the execution area using context management. -2. Analyze the function bytecode in conjunction with the current runtime input parameters to construct a control flow graph (CFG) and a data flow graph (DFG). -3. Simulate in-stack and out-stack operations, trace bytecode by bytecode, and derive the output based on the stack inputs. Python 3.7 to Python 3.11 has a corresponding simulation implementation for each bytecode, noting that the type size of the outputs is derived, not the actual execution of the values, unless the constants are collapsed. -4. During the simulated execution of the bytecode, translate the derivation results and operations into MindIR, and finally, optimize the static graph by constant folding, UD analysis (removing useless input and output parameters), etc. -5. Before executing the equivalent static graph, compare the input parameters with the caretaker Guard conditions generated during the optimization process, and based on the runtime information, select the matching static graph for execution. -6. Dynamically manage the matching relationship between Guard and static graph buffer, recycle the unused static graph buffer, and optimize the static graph buffer through Symbolic Shape and Dynamic Shape. 
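As a hedged illustration of the fallback behavior described in the steps above (the function and its use of NumPy are hypothetical, not taken from the original documentation): when a statement cannot be captured into the static graph, bytecode mode keeps it running in Python and still captures the remaining Tensor computation, instead of raising a compilation error.

```python
import numpy as np
import mindspore as ms
from mindspore import Tensor, jit

@jit(capture_mode="bytecode")
def mixed(x, scale):
    # A host-side NumPy call may not be representable in the static graph;
    # in that case bytecode mode leaves it to Python execution while the
    # Tensor computation below is captured as a static subgraph.
    factor = float(np.tanh(scale))
    return x * factor + 1.0

x = Tensor(np.ones((2, 3)), ms.float32)
print(mixed(x, 0.5))
```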
- -The compilation process of bytecode is illustrated in the following diagram: - -![image](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_zh_cn/features/compile/images/bytecode.png) - -**bytecode Usage** - -Setting the capture_mode parameter of jit to bytecode switches the mode of operation of the modifier function to bytecode, for example: - -```python -import numpy as np -import mindspore as ms -from mindspore import ops -from mindspore import jit -from mindspore import Tensor - -@jit(capture_mode="bytecode") -def tensor_cal(x, y, z): - return ops.matmul(x, y) + z - -x = Tensor(np.ones(shape=[2, 3]), ms.float32) -y = Tensor(np.ones(shape=[3, 4]), ms.float32) -z = Tensor(np.ones(shape=[2, 4]), ms.float32) -ret = tensor_cal(x, y, z) -print(ret) -``` - -```text -[[4. 4. 4. 4.] - [4. 4. 4. 4.]] -``` - -**Advantages** - -- Good user experience, no human intervention, user-written web code always runs properly, and code that can't be executed by static graphs will automatically run using dynamic graphs. -- bytecode can make more statements into the static graph by transforming the byte code. Users do not need to perceive or modify the code. - -**Limitations** - -- Users can't explicitly do performance acceleration for certain code, and for scenarios with more cracked graphs, the performance acceleration may not be obvious. - -## Trace - -MindSpore also offers another static acceleration mechanism called trace. Users can decorate a function with the `@jit(capture_mode=“trace”)` decorator to execute the function in trace mode. In this mode, the code first runs in pynative mode, during which the operators executed at runtime are recorded and captured into the computation graph. Subsequent executions of the decorated code will directly execute the computation graph constructed during the first execution. This mechanism does not parse syntax but only captures the operators called during runtime, thus avoiding syntax-related errors. It captures the operators invoked during the execution of the pynative mode, captures the Python execution flow into a graph, and compiles the captured operators into the computation graph. Operations without corresponding operators will have their return values recorded as constants in the computation graph. The generated computation graph runs in the manner of static graph execution. - -**trace Usage** - -Setting the capture_mode parameter of jit to trace switches the mode of operation of the modifier function to trace, for example: - -```python -import numpy as np -import mindspore as ms -from mindspore import ops -from mindspore import jit -from mindspore import Tensor - -@jit(capture_mode="trace") -def tensor_cal(x, y, z): - return ops.matmul(x, y) + z - -x = Tensor(np.ones(shape=[2, 3]), ms.float32) -y = Tensor(np.ones(shape=[3, 4]), ms.float32) -z = Tensor(np.ones(shape=[2, 4]), ms.float32) -ret = tensor_cal(x, y, z) -print(ret) -``` - -```text -[[4. 4. 4. 4.] - [4. 4. 4. 4.]] -``` - -**Advantages of trace** - -- The graph construction capability is robust; as long as the code has corresponding operators, they can be captured into the graph without the need for additional adaptation. There will be no syntax-related errors when building the static graph. -- Good user experience, no human intervention, user-written web code always runs properly. 
- -**Limitations of trace** - -- It is unable to detect the control flow within the code, and correctness cannot be ensured in scenarios where different branches of the control flow are entered during multiple executions. -- Operations in the code that are not defined as operators, such as calls to third-party libraries, are fixed as constants in the computation graph, and correctness cannot be guaranteed across multiple runs. - diff --git a/docs/mindspore/source_en/features/compile/graph_optimization.md b/docs/mindspore/source_en/features/compile/graph_optimization.md deleted file mode 100644 index 00bb22bb1965adbb026a688f873251f0696e7e66..0000000000000000000000000000000000000000 --- a/docs/mindspore/source_en/features/compile/graph_optimization.md +++ /dev/null @@ -1,318 +0,0 @@ -# Graph Optimization (Compilation) - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_en/features/compile/graph_optimization.md) - -Similar to traditional compilers, MindSpore also performs compilation optimization after graph construction. The main purpose of compilation optimization is to analyze and transform MindSpore's intermediate representation MindIR by static analysis techniques to achieve goals such as reducing the size of the target code, improving execution efficiency, lowering runtime resource consumption, or enhancing other performance metrics. Compilation optimization is a crucial part of the graph compilation system and plays an extremely important role in improving the performance and resource utilization of the entire neural network model. Compared with the original code that has not been optimized, compilation optimization can bring several times or even tens of times performance improvement. - -This section mainly introduces front-end compilation optimization techniques that are independent of specific hardware. Hardware-specific back-end compilation optimization techniques are not within the scope of this discussion. - -## Principles of Front-End Compilation Optimization Techniques - -Similar to traditional compilation optimization techniques, compilation optimization in MindSpore is also carried out through a series of Passes. Each Pass takes the MindIR produced by the previous Pass as input and generates a new MindIR representation as output after optimization. A large Pass can include multiple smaller Passes, each of which is only responsible for a single point of compilation optimization, such as arithmetic simplify, inline, redundancy elimination and etc. The optimization results produced by one Pass may create optimization opportunities for other Passes, so these Passes can be run in a loop until the MindIR no longer changes. - -The selection of which Passes to run and how to arrange the execution order of these Passes has a very important impact on the final compilation result. Depending on the actual situation, the optimization actions to be performed can be adjusted by setting compilation optimization strategies (such as optimization levels, number of iterations, etc.). - -## Common Front-End Compilation Optimization Techniques - -There are many front-end compilation optimization techniques, such as arithmetic simplify, inline, and redundancy elimination. This section will introduce some representative compilation optimization techniques. 
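To watch these passes at work on a concrete program, the MindIR produced before and after optimization can be dumped to disk and inspected. The following is a minimal sketch; it assumes the `save_graphs`/`save_graphs_path` options of `mindspore.set_context` are available in your MindSpore version, and it reuses the `x + 0` pattern that the arithmetic-simplify example below starts from:

```python
import numpy as np
import mindspore as ms
from mindspore import Tensor, jit

# Assumption: save_graphs dumps the intermediate MindIR (before and after
# the optimization passes) into the given directory for inspection.
ms.set_context(save_graphs=True, save_graphs_path="./ir_dump")

@jit
def func(x):
    return x + 0  # a candidate for arithmetic simplification

m = Tensor(np.array([[1, 2, 3], [4, 5, 6]]).astype(np.int32))
print(func(m))
```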
- -### Arithmetic Simplify - -In traditional compilers, arithmetic simplify is a compiler optimization technique aimed at simplifying algebraic expressions in source code, eliminating redundant calculations, improving program execution efficiency, and reducing memory usage. - -For example, in the following code snippet: - -```cpp -int a = x * 1; -int b = x + 0; -int c = x * 0 + y * 1; -``` - -Traditional compilers perform equivalent substitution on recognized expressions based on algebraic rules and identities. Common algebraic rules include laws of union, commutative, and distributive, and compilers will try to replace expressions with simpler forms as much as possible. By analyzing AST or SSA analysis is used for optimization, identifying and simplifying code as follows: - -```cpp -a = x; -b = x; -c = y; -``` - -In the MindSpore compiler, the principle of arithmetic simplify is different from traditional compilers. It processes computational graphs rather than traditional control flow graphs. By adjusting the execution order of operators in the computational graph or deleting unnecessary operators, it maintains the simplicity of the graph and improves computational efficiency. - -For example, in the following Python code snippet: - -```python -import numpy as np -from mindspore.common import Tensor, jit - -@jit -def func(x): - return x + 0 - -m = Tensor(np.array([[1, 2, 3], [4, 5, 6]]).astype(np.int32)) -out = func(m) -``` - -The MindSpore graph compiler converts Python programs into computational graphs, which consist of multiple subgraphs. The algebraic operations in the source code are converted into operator calls within the subgraph, and it can be seen that the PrimFunc_Add operator is called once. - -```text -%para1_x: - -subgraph @1_func_14() { - %0(CNode_7) = PrimFunc_Add(%para1_x, Tensor(shape=[], dtype=Int32, value=0)) - : (, ) -> () - - Return(%0) - : () -} -``` - -By arithmetic simplify, the PrimFunc_Add operator can be directly removed to simplify the computational graph structure, reducing `x + 0` to `x`. - -```text -%para1_x: - -subgraph @1_func_14() { - Return(%para1_x) - : () -} -``` - -Arithmetic simplify can involve more modifications to the structure of computational graphs, and it is often combined with other compiler optimization techniques such as constant folding and constant propagation to improve program performance. - -### Inline - -In traditional compilers, inline is an optimization technique that replaces function calls with the actual code of the called function, improving program performance. For example, consider a C++ `add` function that sums two numbers: - -```cpp -int add(int a, int b) { - return a + b; -} - -int main() { - int x = add(3, 5); - int y = add(x, 10); - return y; -} -``` - -The compiler uses inline to directly insert the function body at the call site. This eliminates function call overhead and enables follow-up optimizations (e.g., replacing `3 + 5` with its result at compile time). **Replacing calls with code** is the core idea of inline. - -```cpp -int main() { - int x = 3 + 5; // Replace the first call. - int y = x + 10; // Replace the second call. - return y; -} -``` - -In AI frameworks' computational graph compilers, inline serves a similar purpose but operates on "subgraphs" instead of functions. 
For example, consider a Python program: - -```python -from mindspore import Tensor, jit, ops - -def f2(x: Tensor, y: Tensor): - return x * 0.5 + y - -@jit -def f1(a: Tensor, b: Tensor, c: Tensor): - x = f2(a, b) - y = f2(a, c) - return x + y - -# Create three Tensors with random values, each having a shape of (2, 4). -a = ops.randn(2, 4) -b = ops.randn(2, 4) -c = ops.randn(2, 4) -out = f1(a, b, c) -``` - -First, MindSpore's graph compiler converts the Python program into a computational graph. The function calls in the Python program are converted into calls between calculation graphs, and the original calculation graph is similar to the following. The main graph `f1` calls the subgraph `f2` twice. - -```text -# Params: -%para1_a: -%para2_b: -%para3_c: - -subgraph @f2(%para1_x, %para2_y) { - %0 = PrimFunc_Mul(%para1_x, Float32(0.5)) - - %1 = PrimFunc_Add(%0, %para2_y) - - Return(%1) -} - -subgraph @f1() { - %0(x) = call @f2(%para1_a, %para2_b) # Call subgraph f2 - - %1(y) = call @f2(%para1_a, %para3_c) # Call subgraph f2 - - %2 = PrimFunc_Add(%0, %1) - - Return(%2) -} -``` - -With inlining, the subgraph `f2` can be expanded and merged into the main graph `f1`. - -```text -subgraph @f1() { - # First-time subgraph inlining - %0 = PrimFunc_Mul(%para1_a, Float32(0.5)) # Repeated computation - %1 = PrimFunc_Add(%0, %para2_b) - - # Second-time subgraph inlining - %2 = PrimFunc_Mul(%para1_a, Float32(0.5)) # Repeated computation - %3 = PrimFunc_Add(%2, %para3_c) - - %4 = PrimFunc_Add(%1, %3) - - Return(%4) -} -``` - -Before inlining, the compiler might not detect repeated operations in the two calls to subgraph `f2` (as subgraphs are often treated as black boxes). After inlining, the compiler clearly sees `x * 0.5` calculated twice, enabling optimizations like **CSE** (Common Subexpression Elimination) to reduce redundant computations. - -```text -subgraph @f1() { - %0 = PrimFunc_Mul(%para1_a, Float32(0.5)) # CSE merges redundant computations - - %1 = PrimFunc_Add(%0, %para2_b) - - %2 = PrimFunc_Add(%0, %para3_c) # Directly reuse %0 - - %3 = PrimFunc_Add(%1, %2) - - Return(%3) -} -``` - -With inlining, compilers better identify cross-subgraph optimization opportunities. In addition to CSE, it enables operator fusion, memory management optimizations, and many other optimizations. Thus, inline is a critical optimization mechanism in computational graph compilers and a foundation for many cross-subgraph optimizations. - -### Redundancy Elimination - -In traditional compilers, redundancy elimination encompasses various compiler optimization techniques aimed at identifying and removing redundant parts of the code during compilation. This process is designed to reduce unnecessary computations and improve the execution efficiency of programs. - -Redundant code may be intentionally written by developers for readability purposes or may simply be an unintentional result of the coding process. Additionally, intermediate results generated by other optimization techniques during the compilation process (such as arithmetic simplify, inline and common subexpression elimination) may also create opportunities for redundancy elimination. - -There are many techniques for redundancy elimination. This section selects and introduces some of the common ones, including dead code elimination and unreachable code elimination. - -1. **Dead code elimination** - - Removing code whose results are not used. For example, in the following C++ code, the variable `c` is not used by any other code. 
Compilers can use data flow analysis techniques from the field of static analysis to eliminate the computation of code: `int c = x * y`. - - ```cpp - int func(x, y) { - int a = x + y; - int b = x - y; - int c = x * y; // Dead code - int d = a / b; - return d; - } - ``` - -2. **Unreachable code elimination** - - Removing code that is not included in any valid control flow path. For example, in the following C++ code, compilers can use control flow analysis techniques from the field of static analysis to analyze the control flow graph. They can identify that the expression `1 < 0` is always false, and thus the code within this control flow path will never be executed during actual runtime. Therefore, the code in this branch can be eliminated. - - ```cpp - int func(x, y) { - int a = x + y; - - int b; - if 1 < 0 { // Unreachable branch - b = x + y; - } else { - b = x - y; - } - - int d = a / b; - return d; - } - ``` - -In MindSpore's graph mode, the purpose and techniques of redundancy elimination are similar to those in traditional compilers. However, unlike traditional compilers, these redundancy optimization techniques are performed on MindIR. Similarly, common redundancy elimination techniques in MindSpore include: - -1. **Dead code elimination** - - For example, consider the following Python code with redundant computations: - - ```python - import mindspore as ms - from mindspore.common import Tensor, jit - - @jit - def func(x, y): - a = x + y - b = x - y - c = x * y # Dead code - d = a / b - return d - - x = Tensor(20, ms.float32) - y = Tensor(10, ms.float32) - out = func(x, y) - ``` - - The MindSpore graph compiler will convert the Python code decorated with `@jit` into the MindIR representation through static analysis and eliminate the redundant computation `c = x * y`. The resulting MindIR is as follows: - - ```text - # Params: - %para1_x: - %para2_y: - - subgraph @func_1() { - %0(a) = PrimFunc_Add(%para1_x, %para2_y) - : (, ) -> () - %1(b) = PrimFunc_Sub(%para1_x, %para2_y) - : (, ) -> () - %2(d) = PrimFunc_Div(%0, %1) - : (, ) -> () - Return(%2) - : () - } - ``` - -2. **Unreachable code elimination** - - For example, consider the following Python code with an unreachable path: - - ```python - import mindspore as ms - from mindspore.common import Tensor, jit - - @jit - def func(x, y): - a = x + y - if 1 < 0: # Unreachable branch - b = x + y - else: - b = x - y - d = a / b - return d - - x = Tensor(20, ms.float32) - y = Tensor(10, ms.float32) - out = func(x, y) - ``` - - The MindSpore graph compiler will convert the Python code decorated with `@jit` into the MindIR representation through static analysis and eliminate the redundant control flow branch `1 < 0`. The resulting MindIR is as follows: - - ```text - # Params: - %para1_x: - %para2_y: - - subgraph @func_1() { - %0(a) = PrimFunc_Add(%para1_x, %para2_y) - : (, ) -> () - %1(b) = PrimFunc_Sub(%para1_x, %para2_y) - : (, ) -> () - %2(d) = PrimFunc_Div(%0, %1) - : (, ) -> () - Return(%2) cnode_attrs: {checkpoint: Bool(1)} - : () - } - ``` - -Redundancy elimination plays a crucial role in compiler optimization. Without changing the original semantics of the program, it can significantly improve execution efficiency by reducing unnecessary runtime computations and saving computing resources. Redundancy elimination is often combined with other compiler optimization techniques to create more opportunities for eliminating redundant code. 
diff --git a/docs/mindspore/source_en/features/compile/images/graphkernel.png b/docs/mindspore/source_en/features/compile/images/graphkernel.png new file mode 100644 index 0000000000000000000000000000000000000000..06a5af7b00a222044a5a83bbde72a263fd6d3dd9 Binary files /dev/null and b/docs/mindspore/source_en/features/compile/images/graphkernel.png differ diff --git a/docs/mindspore/source_en/features/compile/images/graphkernel_akg_overview.png b/docs/mindspore/source_en/features/compile/images/graphkernel_akg_overview.png new file mode 100644 index 0000000000000000000000000000000000000000..18089ee5c51d88733d58ef22b9e394753116306b Binary files /dev/null and b/docs/mindspore/source_en/features/compile/images/graphkernel_akg_overview.png differ diff --git a/docs/mindspore/source_en/features/compile/multi_level_compilation.md b/docs/mindspore/source_en/features/compile/multi_level_compilation.md deleted file mode 100644 index 5ca3e25b7ab0587be5b12d11a0c5f088a22be5e6..0000000000000000000000000000000000000000 --- a/docs/mindspore/source_en/features/compile/multi_level_compilation.md +++ /dev/null @@ -1,137 +0,0 @@ -# Multi-Level Compilation Introduction (Compilation) - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_en/features/compile/multi_level_compilation.md) - -## Background - -With the arrival of the era of deep learning large models, the bigger the network size is, the bigger the challenge of graph compilation performance, execution performance and debugging and tuning efficiency is. For this reason, MindSpore proposes a multilevel compilation architecture that provides an O(n) multilevel compilation execution model, which are different from each other in terms of graph optimization, operator fusion, memory management, and execution modes, and is designed to provide a diversity of graph mode. Users can choose the most suitable compilation and execution mode according to their own network characteristics and needs: - -1. O0 mode: this is a basic compilation and execution mode, where all optimizations are turned off except those necessary to affect the functionality, and a single-calculus execution is used for execution. Therefore, the execution performance may not be optimal, but it can guarantee the original structure of the graph, which is convenient for users to debug and understand, and the compilation performance is also better. Add and Mul single operator execution is shown in the following figure. -2. O1 mode: this mode performs some basic optimizations, such as common graph optimization and automatic operator fusion optimization, and uses single operator execution for execution. Compared with O0, because of enabling the fusion optimization, the execution performance of O1 can be improved, but it may affect the original structure of the graph, so the compilation performance and debugging and tuning efficiency is lost. In the following figure, Add and Mul are fused into a single fused_op execution. -3. O2 mode: this is a more advanced optimization mode, currently not implemented, the subsequent deeper optimization can use this mode. - -![jit_level_example](./images/multi_level_compilation/jit_level_example.png) - -## Overview of Multi-Level Compilation Architecture - -![jit_level_framework](./images/multi_level_compilation/jit_level_framework.png) - -1. 
Multi-level compilation external interface: configure multi-level compilation level through [mindspore.jit(jit_level="O0/O1")](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.jit.html#mindspore.jit), jit_level defaults to O0. We usually recommend that users use O0 mode for network debugging tuning. After debugging is ready, for better performance you can turn on O1 to run the network. -2. Backend graph compilation: According to the configured multi-level compilation level, different compilation modes are selected. O0 is the most basic native composition and compilation, and O1 adds automatic operator fusion function on the basis of O0, with the main functions of graph optimization, graph-operator fusion, operator selection, and execution sequence scheduling, of which graph-operator fusion is a unique function in O1 mode. -3. Backend graph execution: The O0 and O1 modes are the same at the execution level, and both use a single operator way of scheduling execution, with the main functions of multi-stream concurrency, multi-level streaming, HAL management, and memory management. - -## Introduction to the O0 Model - -O0 is the basic graph compilation and execution mode, except for the necessary impact on the functionality of the optimization, other optimizations are turned off, the use of native graph structure for compilation and execution, easy to debug and tuning, with better compilation performance. The following mainly introduces the functions related to backend graph compilation, and the functions related to backend graph execution are detailed in [runtime](https://www.mindspore.cn/docs/en/master/features/runtime/memory_manager.html). - -### Graph Optimization - -There are fewer graph optimizations for the O0 mode, and the basic optimizations are mainly back-end LazyInline and No-task node execution optimizations. - -- **Back-end LazyInline** - - **LazyInline**: The main idea is to postpone the overhead of the function call to the actual need to call , so that you can reduce the compilation overhead, improve compilation efficiency. LazyInline is the same sub-graph structure reuse in the graph compilation phase, do not unfolding placed in the graph, to avoid the graph size is large resulting in the impact of the compilation performance. - - ![jit_level_lazyinline](./images/multi_level_compilation/jit_level_lazyinline.png) - - **Pipeline Parallelism**: Slicing the operator in the neural network into multiple Stages, and then mapping the Stages to different devices, so that different devices to compute different parts of the neural network. In order to improve efficiency, pipeline parallelism further slices the MiniBatch into finer-grained MicroBatches, in which pipelined scheduling is used, thus achieving the goal of improving efficiency. - - **Back-end LazyInline**: Since MicroBatch slicing of Pipeline parallel leads to the expansion of the entire computational graph to a number of times of the MicroBatch, which results in a huge model size and long compilation performance time (possibly hour-level), and these Micro subgraphs are all structured the same way. In order to solve the compilation performance problem, the LazyInline technique is a great fit, however LazyInline brings problems such as inability to use the optimal way for memory reuse and stream allocation at runtime, inability to perform cross-graph optimization (memory optimization, communication fusion, operator fusion, etc.). 
For this reason, at the end of the compilation of the graph, before the execution of the graph, these Micro subgraphs are as the actual nodes of Inline in order to form a complete global whole graph, and then through memory optimization, communication optimization, redundant computation elimination after the graph Inline, so as to achieve the goal of compilation performance, execution performance, and execution memory are taken into account. - -- **No-task node Execution Optimization** - - ![jit_level_no_task](./images/multi_level_compilation/jit_level_no_task.png) - - No-task node refers to Reshape, ExpandDims, Squeeze, Flatten, FlattenGrad, Reformat, etc. There is no computational logic in these algorithms, and they do not modify the memory layout, but only modify the information of the shape, format. At the end of the compilation of the graph, the No-task node is converted to ref node, the output has the same address as the input, and the kernel launch is skipped in the execution process, so as to achieve the purpose of execution performance optimization. - -### Operator Selection - -Operators are the basic execution units in deep learning frameworks, and they are responsible for performing specific computational tasks, such as matrix multiplication, convolution, pooling. Operator selection requires comprehensive consideration of factors such as operator type, data type, hardware platform, and operator optimization in order to select the optimal operator for deep learning tasks. - -The operator types in the backend of MindSpore Ascend are Aclnn kernel/Aclop kernel/Hccl kernel/Cpu kernel, and the process of operator selection is shown as follows: - -![jit_level_kernelselect](./images/multi_level_compilation/jit_level_kernelselect.png) - -1. operator type: firstly, according to the type of operator, choose whether it is computational operator or communication operator. -2. hardware platform: If there is a corresponding operator on hardware, then the operator on hardware is preferred, otherwise the heterogeneous operator on CPU is chosen, e.g., shape-related computational operators may only be suitable to be supported on CPU, and there is no corresponding operator on hardware. -3. operator efficiency: due to the better performance of Aclnn operator on Ascend, the computational operator will prefer Aclnn kernel if there is a corresponding Aclnn kernel, otherwise Aclop kernel will be chosen. -4. If no operator is selected in any of the above 3 steps, it is an unsupported operator and the operator selection fails to exit. - -### Executing Order Organization - -![jit_level_exec_order](./images/multi_level_compilation/jit_level_exec_order.png) - -Different graph traversal algorithms produce execution orders with large differences in execution performance and memory, as shown in the figure above: - -- **Execution order obtained by BFS**: kernel1-> kernel2-> kernel4-> kernel5-> kernel3-> kernel6. Memory peaks at 5G (kernel3 can release kernel1 and kernel2 after execution, and then reuse them when it's kernel6's turn to execute, so kernel6 doesn't need to request extra memory). -- **Execution order obtained by DFS**: kernel1-> kernel2-> kernel3-> kernel4-> kernel5-> kernel6. Memory peaks at 4G (kernel3 can release kernel1 and kernel2 after execution, and then reuse them when it's kernel4 and kernel5's turn to execute, so kernel4 and kernel5 don't need to request extra memory). 
- -Execution order scheduling is a complex problem of solving optimal operator concurrency under certain memory constraints, which not only requires identifying and exploiting concurrency opportunities in the computational graph to improve computational efficiency, but also must consider multiple constraints at the same time to ensure the stability and efficiency of the system. - -- First, the optimization module needs to address the complexity of solving for optimal operator concurrency. Due to the large number of operators in the computational graph and their interdependencies, finding an execution order that maximizes concurrency while maintaining the logical correctness of the computational graph is a challenging task. -- Second, memory constraints are a critical factor that cannot be ignored in execution order optimization. Increasing concurrency, while improving computational efficiency, tends to significantly increase peak memory requirements, which may lead to Overflow of Memory (OOM) errors, especially in resource-constrained environments. Therefore, the optimization module must weigh the relationship between concurrency and memory usage to ensure that concurrency is increased without exceeding the memory capacity of the system. -- MindSpore's execution order adjustment module combines rule-based and heuristic-based strategies to provide both bfs/dfs execution order orchestration algorithms [mindspore.jit(option={“exec_order”: “bfs/dfs”})](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.jit.html#mindspore.jit) to achieve fine-grained adjustment of the execution order of the computation graph, so as to effectively deal with multiple challenges such as memory constraints and system stability while ensuring computational efficiency. - -## Introduction to the O1 Model - -O1 is mainly targeted at implementing general-purpose, generalizable AI compilation optimizations on top of O0 to support better execution performance requirements for most general-purpose training and inference scenarios. - -In the current phase, O1 mainly supports graph-kernel fusion optimization. The main idea is to automatically identify neighboring fusable nodes in the computational graph during the static graph compilation phase, and then fuse them into executable operators with larger granularity. Through graph-kernel fusion, optimization effects such as increasing the computational locality of operators and reducing the overall global memory access bandwidth overhead are achieved. As verified by real-world tests on more than 15 networks, O1 is able to achieve an average of 15% performance acceleration compared to O0. Especially for access-intensive networks, the optimization effect of O1 is more significant. - -### Graph-Kernel Fusion - -Mainstream AI computing frameworks such as MindSpore provides operators to users that is usually defined in terms of understandable and easy use for user. Each operator carries a different amount of computation and varies in computational complexity. However, from the hardware execution point of view, this natural, user perspective-based division of operator computation volume is not efficient and does not fully utilize the computational power of hardware resources, which is mainly reflected in the following aspects: - -1. Computationally overloaded and overly complex operators, which usually makes it difficult to generate well-cut high-performance operator, thereby reducing equipment utilization. -2. 
Operators that are too small in computation may also cause latency in computation and thus reduce equipment utilization, as the computation cannot effectively hide the data moving overhead. -3. Hardware Devices are usually multi-core, many-core architectures. When the operator shape is small or other reasons cause insufficient computational parallelism, it may cause some cores to be idle, thus reducing the device utilization. In particular, chips based on Domain Specific Architecture (DSA for short) are more sensitive to these factors. It has been a big challenge to maximize the performance of hardware operator while making the operator easy to use. - -In terms of AI framework design, the current industry mainstream adopts a separate layer implementation approach of graph and operator layers. The graph layer is responsible for fusing or regrouping the computational graph, and the operator layer is responsible for compiling the fused or regrouped operators into high-performance executable operators. The graph layer is usually processed and optimized by using Tensor-based High-Level IR, while the operator layer is analyzed and optimized by using computational instruction-based Low-Level IR. This artificial separate-layer process significantly increases the difficulty of performing collaborative optimization in both graph and computational layers. - -MindSpore has adopted the technique of graph-kernel fusion to better solve this problem in the past few years. Typical networks in different categories such as NLP and recommendation show significant gains in training speed after enabling graph-kernel fusion. One of the main reasons is the presence of a large number of small operator combinations in these networks, which have more opportunities for fusion optimization. - -#### Graph-Kernel Fusion Architecture and Overall Process - -The overall architecture of graph-kernel fusion is shown in the figure below. The main idea in the graph layer is to turn on the composite operator, then perform cross-boundary aggregation and optimization, and finally perform Kernel operator splitting. The main steps include: - -1. Composite Expansion: Expand the composite operator into the basic operator and form the Composite subgraph to facilitate subsequent cross-boundary optimization and operator splitting. -2. Cross-OP Aggregation: Aggregate adjacent elementary operators or Composite subgraphs to form larger aggregated subgraphs for subsequent cross-boundary optimization and operator splitting. -3. High-Level Optimization: Based on the aggregated subgraphs obtained in the above two steps, we can perform a large number of cross-boundary optimizations, such as algebraic simplification, common subexpression extraction (CSE). -4. Kernel Partition: Based on the computational features and the performance of the fusion operator, the operator splitting is performed on the aggregated computational subgraph. - -The optimized computational graph is passed to MindSpore AKG as a subgraph for further back-end optimization and target code generation. - -![graphkernel](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_zh_cn/features/images/graphkernel.png) - -By following these steps, we can obtain two aspects of performance gains: - -1. Cross-boundary performance optimization gains between different operators. -2. The optimal granularity of the fusion operator is obtained by reorganizing and splitting the entire computational graph. 
- -#### Fusion Operator Acceleration Optimization (MindSpore AKG) - -As mentioned earlier, in scenarios such as HPC and deep neural network training, graph-kernel fusion optimization can bring exponential performance improvements. However, with the increasing capability of graph-kernel fusion, the development of fusion operator becomes a bottleneck point to continue to improve the graph-kernel fusion capability. - -The automatic generation technology of fusion operators can solve the problem of high programming threshold for developing fusion operators based on DSA, allowing programmers to focus on the implementation logic of operators during operator development without focusing on back-end optimization, which greatly improves their development efficiency. Especially for scenarios with complex back-end hardware architectures and the presence of complex operators and fusion operators, automatic operator generation techniques are more critical. - -Therefore, **MindSpore AKG accelerates optimization and automatic generation of fusion operator based on Polyhedral Compilation Technology (Polyhedral Model)**, can help fused operators optimized by MindSpore graph-kernel fusion module to automatically generate high-performance kernel on **heterogeneous hardware platforms** (GPU/Ascend) and improve MindSpore training performance. - -Architecture and Overall Process are as follows: - -The overall framework of MindSpore AKG is shown in the figure above: - -- IR Normalization - - The input of MindSpore AKG is the fused subgraph optimized by MindSpore graph-kernel fusion module, and the operator in the subgraph is expressed by various descriptions such as TVM's Compute/IR Builder/Hybrid. The DSL is then converted to Halide IR ([Halide](https://halide-lang.org/), a common language used to develop high-performance image processing and Array computation, which can be used as an intermediate expression for decoupling algorithms and optimization) and IR normalization. - - After the initial simplification and optimization is completed, the Halide IR is transformed into the scheduling tree required by the Poly module. -- Poly module scheduling optimization - - Using the Pluto scheduling algorithm in Polyhedral technology to achieve automatic fusion of loops, automatic rearrangement and other transformations to automatically generate an initial schedule that satisfies parallelism and data locality for the fusion operator. - - To quickly adapt to different hardware backends, the optimization pass in the Poly module is divided into hardware-independent generic optimizations and hardware-related specific optimizations, which are stitched and combined according to hardware features at compilation time, to achieve fast adaptation of heterogeneous hardware backends. The pass such as Auto-slicing, auto-mapping and auto-memory boosting will give different optimizations depending on the nature of the hardware architecture. -- Backends optimization - - In order to further improve the performance of the operator, we developed corresponding optimization passes for different hardware backends, such as data alignment and instruction mapping in Ascend backend, vectorized access and insertion of synchronization instructions in GPU backend, and finally generated the corresponding platform code. - -### Other Graph Optimization Techniques - -In addition to graph-kernel fusion, O1 may be gradually extended to add some other graph optimization techniques in subsequent releases. For example: - -1. 
KernelPacket: automatic fusion and optimization of shape computations in dynamic shape scenarios; -2. Communicative-kernel fusion: fusion of communication operators with computational operators. diff --git a/docs/mindspore/source_en/features/data_engine.md b/docs/mindspore/source_en/features/data_engine.md index fada48b563d61b9066e17ffa3fd874abeaacfa9a..256403c7b0b70fac8e1d1d0fab10ebb251c357f4 100644 --- a/docs/mindspore/source_en/features/data_engine.md +++ b/docs/mindspore/source_en/features/data_engine.md @@ -14,7 +14,7 @@ The core of MindSpore training data processing engine is to efficiently and flex - Provide an automatic data augmentation mode, and perform automatic data augmentation on images based on specific strategies. - Provide single-node data caching capability to solve the problem of repeated loading and processing of data, reduce data processing overhead, and improve device-to-device training efficiency. -Please refer to the instructions for usage: [Data Loading And Processing](https://www.mindspore.cn/docs/en/master/features/dataset/overview.html) +Please refer to the instructions for usage: [Data Loading And Processing](https://www.mindspore.cn/tutorials/en/master/dataset/overview.html) ![image](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_en/features/images/data/data_engine_en.png) diff --git a/docs/mindspore/source_en/features/overview.md b/docs/mindspore/source_en/features/overview.md index 9d49376aa4bf3db34710b0d1a161396573228c12..c1385397f486d0fd07c7c1c396640f9d2a2ba52c 100644 --- a/docs/mindspore/source_en/features/overview.md +++ b/docs/mindspore/source_en/features/overview.md @@ -31,19 +31,21 @@ The overall architecture of MindSpore is as follows: MindSpore is a full-scenario deep learning framework designed to achieve three major goals: easy development, efficient execution, and unified deployment across all scenarios. Easy development is reflected in API friendliness and low debugging difficulty; efficient execution includes computational efficiency, data preprocessing efficiency, and distributed training efficiency; full-scenario means the framework simultaneously supports cloud, edge, and device-side scenarios. -### Fusion of Functional and Object-Oriented Programming Paradigms +### Programming Paradigms and Experience (pynative + graph mode) + +#### Fusion of Functional and Object-Oriented Programming MindSpore provides both object-oriented and function-oriented programming paradigms, both of which can be used to construct network algorithms and training processes. Developers can derive from the nn.Cell class to define AI networks or layers with required functionality, and assemble various defined layers through nested object calls to complete the definition of the entire AI network. -At the same time, developers can also define a pure Python function that can be source-to-source compiled by MindSpore, and accelerate its execution through functions or decorators provided by MindSpore. Under the requirements of MindSpore's static syntax, pure Python functions can support nested subfunctions, control logic, and even recursive function expressions. Therefore, based on this programming paradigm, developers can flexibly enable certain functional features, making it easier to express business logic. +At the same time, developers can also define a pure Python function that can be source-to-source compiled by MindSpore, and accelerate its execution through functions or decorators provided by MindSpore. 
Under the requirements of MindSpore graph mode syntax, pure Python functions can support nested subfunctions, control logic, and even recursive function expressions. Therefore, based on this programming paradigm, developers can flexibly enable certain functional features, making it easier to express business logic. MindSpore implements functional differential programming, which performs differentiation based on the call chain according to the calling relationship for function objects that can be differentiated. This automatic differentiation strategy better aligns with mathematical semantics and has an intuitive correspondence with composite functions in basic algebra. As long as the derivative formulas of basic functions are known, the derivative formula of a composite function composed of any basic functions can be derived. At the same time, based on the functional programming paradigm, MindSpore provides rich higher-order functions such as vmap, shard, and other built-in higher-order functions. Like the differential function grad, these allow developers to conveniently construct a function or object as a parameter for higher-order functions. Higher-order functions, after internal compilation optimization, generate optimized versions of developers' functions, implementing features such as vectorization transformation and distributed parallel partitioning. -### [Unified Programming Experience for Dynamic and Static Graphs](https://www.mindspore.cn/docs/en/master/features/program_form/overview.html) +#### Unified Programming Experience for Pynative and Graph Mode Traditional AI frameworks mainly have two programming execution forms: static graph mode and dynamic eager mode. @@ -67,16 +69,18 @@ MindSpore introduced Tensor Redistribution (TR) technology in parallel strategy At the same time, MindSpore also provides various parallel strategies such as pipeline parallelism, optimizer parallelism, and recomputation for developers to use. -### High-Performance Hardware Utilization +### Compilation Based on compilation technology, MindSpore provides rich hardware-independent optimizations such as IR fusion, algebraic simplification, constant folding, and common subexpression elimination. At the same time, it also provides various hardware optimization capabilities for different hardware such as NPU and GPU, thereby better leveraging the large-scale computational acceleration capabilities of hardware. -#### [Graph-Algorithm Fusion](https://www.mindspore.cn/docs/en/master/features/compile/multi_level_compilation.html#graph-kernel-fusion) +#### [Multi-Level Compilation Architecture](https://www.mindspore.cn/docs/en/master/features/compile/compilation_guide.html) Mainstream AI computing frameworks like MindSpore typically define operators from the perspective of developer understanding and ease of use. Each operator carries varying amounts of computation and computational complexity. However, from a hardware execution perspective, this natural operator computational division based on the developer's perspective is not efficient and cannot fully utilize hardware computational capabilities. This is mainly reflected in: 1. Operators with excessive computational volume or complexity are usually difficult to generate well-partitioned high-performance operators, thereby reducing device utilization; + 2. Operators with too little computational volume may cause computational waiting latency due to the inability to effectively hide data movement overhead, thereby reducing device utilization; + 3. 
Hardware devices are typically multi-core or many-core structures, and when operator shapes are small or other reasons cause insufficient computational parallelism, some cores may remain idle, thereby reducing device utilization. This is especially sensitive for chips based on Domain Specific Architecture (DSA). How to maximize hardware computational performance while ensuring operators are easy to use has always been a significant challenge. In terms of AI framework design, the industry mainstream currently adopts a layered implementation method with graph layer and operator layer. The graph layer is responsible for fusing or reorganizing the computational graph, while the operator layer is responsible for compiling the fused or reorganized operators into high-performance executable operators. The graph layer typically uses High-Level IR processing and optimization based on Tensor, while the operator layer uses Low-Level IR analysis and optimization based on computational instructions. This artificial layering significantly increases the difficulty of coordinated optimization between the graph and operator layers. @@ -101,6 +105,6 @@ MindSpore is an AI framework that integrates training and inference, supporting According to actual execution environments and business requirements, MindSpore provides multiple specification versions, supporting deployment on cloud, servers, mobile and other embedded devices, and ultra-lightweight devices such as earphones. -### [Third-Party Hardware Integration](https://www.mindspore.cn/docs/en/master/features/runtime/pluggable_device.html) +### Third-Party Hardware Integration Based on the unified MindIR, MindSpore has built an open AI architecture that supports third-party chip plugins, standardization, and low-cost rapid integration, which can connect to GPU series chips as well as various DSA chips. MindSpore provides two chip integration methods: Kernel mode and Graph mode, allowing chip manufacturers to choose the integration method according to their own characteristics. diff --git a/docs/mindspore/source_en/features/runtime/memory_manager.md b/docs/mindspore/source_en/features/runtime/memory_manager.md index 1162827adce30c41f68c497e4da76979301607b9..c51fcf785cbd9ba3898c0f727feab7014c3a162e 100644 --- a/docs/mindspore/source_en/features/runtime/memory_manager.md +++ b/docs/mindspore/source_en/features/runtime/memory_manager.md @@ -9,7 +9,7 @@ Device memory (hereinafter referred to as memory) is the most important resource 1. Memory pool serves as a base for memory management and can effectively avoid the overhead of frequent dynamic allocation of memory. 2. Memory reuse algorithm, as a core competency in memory management, needs to have efficient memory reuse results as well as minimal memory fragmentation. 
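As a usage sketch of the two points above: the pool's initial size, growth step, and upper bound can be configured before the network runs through the `mindspore.runtime.set_memory` interface referenced later on this page. The size values below are illustrative only.

```python
from mindspore import runtime

# Illustrative values only: pre-allocate a 2 GB pool, expand it in 2 GB
# steps, and cap total device memory usage at 28 GB; the parameter names
# follow the mindspore.runtime.set_memory interface cited below.
runtime.set_memory(init_size="2GB", increase_size="2GB", max_size="28GB")
```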
-![memory_manager](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_en/features/images/multi_level_compilation/jit_level_memory_manage.png) +![memory_manager](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_en/features/compile/images/multi_level_compilation/jit_level_memory_manage.png) ## Interfaces @@ -22,7 +22,7 @@ The memory management-related interfaces are detailed in [runtime interfaces](ht The core idea of memory pool as a base for memory management is to pre-allocate a large block of contiguous memory, allocate it directly from the pool when applying for memory, and return it to the pool for reuse when releasing it, instead of frequently calling the memory application and release interfaces in the system, which reduces the overhead of frequent dynamic allocations, and improves system performance. MindSpore mainly uses the BestFit memory allocation algorithm, supports dynamic expansion of memory blocks and defragmentation, and sets the initialization parameters of the memory pool through the interface [mindspore.runtime.set_memory(init_size,increase_size,max_size)](https://www.mindspore.cn/docs/en/master/api_python/runtime/mindspore.runtime.set_memory.html) to control the dynamic expansion size and maximum memory usage. -![memory_pool](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_en/features/images/multi_level_compilation/jit_level_memory_pool.png) +![memory_pool](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_en/features/compile/images/multi_level_compilation/jit_level_memory_pool.png) 1. Slicing operation: When memory is allocated, free areas are sorted according to their sizes, the first free area that meets the requirements is found, allocated on demand, the excess is cut, and a new block of free memory is inserted. 2. Merge operation: When memory is reclaimed, neighboring free memory blocks are reclaimed and merged into one large free memory block. diff --git a/docs/mindspore/source_en/features/runtime/multilevel_pipeline.md b/docs/mindspore/source_en/features/runtime/multilevel_pipeline.md index 235120d55c435a9349a24584098aea1a7f62f1e6..252fd765a089b605229f9efa137792fbee0e2d29 100644 --- a/docs/mindspore/source_en/features/runtime/multilevel_pipeline.md +++ b/docs/mindspore/source_en/features/runtime/multilevel_pipeline.md @@ -12,7 +12,7 @@ Runtime scheduling for an operator mainly includes the operations InferShape (in Multi-stage flow is a key performance optimization point for runtime, which improves runtime scheduling efficiency by task decomposition and parallel flow issued to give full play to CPU multi-core performance. The main flow is as follows: -![rt_pipeline](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_en/features/images/multi_level_compilation/jit_level_rt_pipeline.png) +![rt_pipeline](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_en/features/compile/images/multi_level_compilation/jit_level_rt_pipeline.png) 1. Task decomposition: the operator scheduling is decomposed into three tasks InferShape, Resize and Launch. 2. Queue creation: Create three queues, Infer Queue, Resize Queue and Launch Queue, for taking over the three tasks in step 1. 
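The queue-based hand-off described above can be pictured with an ordinary producer-consumer pipeline. The sketch below is only an illustration of the scheduling pattern, not MindSpore runtime code: the `infer_stage`, `resize_stage` and `launch_stage` bodies are invented placeholders that merely mimic how work items flow through the Infer, Resize and Launch queues.

```python
# Illustrative sketch of the three-stage runtime pipeline described above:
# each stage runs in its own thread and hands work to the next stage through
# a queue, so the stages of different operators overlap in time.
import queue
import threading

infer_q, resize_q, launch_q = queue.Queue(), queue.Queue(), queue.Queue()
SENTINEL = None  # marks the end of the work stream

def infer_stage():
    while (op := infer_q.get()) is not SENTINEL:
        op["shape"] = (2, len(op["name"]))   # stand-in for real InferShape
        resize_q.put(op)
    resize_q.put(SENTINEL)

def resize_stage():
    while (op := resize_q.get()) is not SENTINEL:
        op["workspace"] = op["shape"][0] * op["shape"][1]  # stand-in for Resize
        launch_q.put(op)
    launch_q.put(SENTINEL)

def launch_stage(done):
    while (op := launch_q.get()) is not SENTINEL:
        done.append(op["name"])              # stand-in for the kernel launch

done = []
threads = [threading.Thread(target=infer_stage),
           threading.Thread(target=resize_stage),
           threading.Thread(target=launch_stage, args=(done,))]
for t in threads:
    t.start()
for name in ["MatMul", "Add", "ReLU"]:
    infer_q.put({"name": name})
infer_q.put(SENTINEL)
for t in threads:
    t.join()
print(done)  # operators are launched in issue order: ['MatMul', 'Add', 'ReLU']
```

Because the stages overlap across operators, one operator can be launched while shape inference for the next is still in flight, which is where the host-side scheduling gain described above comes from.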
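Returning to the memory pool interface mentioned earlier on this page, the snippet below sketches how its initialization parameters might be set. The parameter names follow the `mindspore.runtime.set_memory` API linked above; the concrete size strings are invented example values, so check the API page for the accepted formats and defaults before copying them.

```python
# Illustrative only: configure the memory pool before the first kernel runs.
# Parameter names follow the set_memory API referenced above; the size strings
# are example values, not recommendations.
import mindspore

mindspore.runtime.set_memory(
    init_size="2GB",      # initial block pre-allocated by the pool
    increase_size="2GB",  # step used when the pool dynamically expands
    max_size="32GB",      # upper bound on device memory the pool may occupy
)
```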
diff --git a/docs/mindspore/source_en/features/runtime/multistream_concurrency.md b/docs/mindspore/source_en/features/runtime/multistream_concurrency.md index b99d645b3ffac040d2061977cdcfea8d372adda0..db9291024e4305e603a7020cbf1859042f2f15b0 100644 --- a/docs/mindspore/source_en/features/runtime/multistream_concurrency.md +++ b/docs/mindspore/source_en/features/runtime/multistream_concurrency.md @@ -10,7 +10,7 @@ During the training of large-scale deep learning models, the importance of commu Traditional multi-stream concurrency methods usually rely on manual configuration, which is not only cumbersome and error-prone, but also often difficult to achieve optimal concurrency when faced with complex computational graphs. MindSpore's automatic stream assignment feature automatically identifies and assigns concurrency opportunities in the computational graph by means of an intelligent algorithm, and assigns different operators to different streams for execution. This automated allocation process not only simplifies user operations, but also enables dynamic adjustment of stream allocation policies at runtime to accommodate different computing environments and resource conditions. -![multi_stream](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_en/features/images/multi_level_compilation/jit_level_multi_stream.png) +![multi_stream](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_en/features/compile/images/multi_level_compilation/jit_level_multi_stream.png) The principles are as follows: diff --git a/docs/mindspore/source_en/note/api_mapping/pytorch_api_mapping.md b/docs/mindspore/source_en/note/api_mapping/pytorch_api_mapping.md index 9b2e36310364f22e4303ddb93bf788c0760d5e42..81a23c966e56808e8bffdac4da6973cc0289d9de 100644 --- a/docs/mindspore/source_en/note/api_mapping/pytorch_api_mapping.md +++ b/docs/mindspore/source_en/note/api_mapping/pytorch_api_mapping.md @@ -292,9 +292,9 @@ Because of the framework mechanism, MindSpore does not provide the following par | [torch.nn.AvgPool2d](https://PyTorch.org/docs/2.1/generated/torch.nn.AvgPool2d.html) | [mindspore.mint.nn.AvgPool2d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.AvgPool2d.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.BCELoss](https://PyTorch.org/docs/2.1/generated/torch.nn.BCELoss.html) | [mindspore.mint.nn.BCELoss](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.BCELoss.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.BCEWithLogitsLoss](https://pytorch.org/docs/2.1/generated/torch.nn.BCEWithLogitsLoss.html) | [mindspore.mint.nn.BCEWithLogitsLoss](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.BCEWithLogitsLoss.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | -| [torch.nn.BatchNorm1d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm1d.html) | [mindspore.mint.nn.BatchNorm1d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.BatchNorm1d.html) | 
[Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions)| -| [torch.nn.BatchNorm2d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm2d.html) | [mindspore.mint.nn.BatchNorm2d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.BatchNorm2d.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions)| -| [torch.nn.BatchNorm3d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm3d.html) | [mindspore.mint.nn.BatchNorm3d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.BatchNorm3d.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions)| +| [torch.nn.BatchNorm1d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm1d.html) | [mindspore.mint.nn.BatchNorm1d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.BatchNorm1d.html) | Consistent functions, MindSpore is in inference mode by default. | +| [torch.nn.BatchNorm2d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm2d.html) | [mindspore.mint.nn.BatchNorm2d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.BatchNorm2d.html) | Consistent functions, MindSpore is in inference mode by default. | +| [torch.nn.BatchNorm3d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm3d.html) | [mindspore.mint.nn.BatchNorm3d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.BatchNorm3d.html) | Consistent functions, MindSpore is in inference mode by default. | | [torch.nn.ConstantPad1d](https://pytorch.org/docs/2.1/generated/torch.nn.ConstantPad1d.html) | [mindspore.mint.nn.ConstantPad1d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.ConstantPad1d.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.ConstantPad2d](https://pytorch.org/docs/2.1/generated/torch.nn.ConstantPad2d.html) | [mindspore.mint.nn.ConstantPad2d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.ConstantPad2d.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.ConstantPad3d](https://pytorch.org/docs/2.1/generated/torch.nn.ConstantPad3d.html) | [mindspore.mint.nn.ConstantPad3d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.ConstantPad3d.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | @@ -303,7 +303,7 @@ Because of the framework mechanism, MindSpore does not provide the following par | [torch.nn.ConvTranspose2d](https://pytorch.org/docs/2.1/generated/torch.nn.ConvTranspose2d.html) | [mindspore.mint.nn.ConvTranspose2d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.ConvTranspose2d.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.CrossEntropyLoss](https://pytorch.org/docs/2.1/generated/torch.nn.CrossEntropyLoss.html) | 
[mindspore.mint.nn.CrossEntropyLoss](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.CrossEntropyLoss.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.Dropout](https://pytorch.org/docs/2.1/generated/torch.nn.Dropout.html) | [mindspore.mint.nn.Dropout](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.Dropout.html) | Consistent functions, MindSpore is in inference mode by default. | -| [torch.nn.Dropout2d](https://pytorch.org/docs/2.1/generated/torch.nn.Dropout2d.html) | [mindspore.mint.nn.Dropout2d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.Dropout2d.html) | Consistent functions, MindSpore has no parameter inplace, MindSpore is in inference mode by default. | +| [torch.nn.Dropout2d](https://pytorch.org/docs/2.1/generated/torch.nn.Dropout2d.html) | [mindspore.mint.nn.Dropout2d](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.Dropout2d.html) | Consistent functions, MindSpore is in inference mode by default. | | [torch.nn.ELU](https://pytorch.org/docs/2.1/generated/torch.nn.ELU.html) | [mindspore.mint.nn.ELU](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.ELU.html) |Consistent functions, MindSpore has no parameter inplace. | | [torch.nn.Embedding](https://pytorch.org/docs/2.1/generated/torch.nn.Embedding.html) | [mindspore.mint.nn.Embedding](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.Embedding.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.Flatten](https://pytorch.org/docs/2.1/generated/torch.nn.Flatten.html) | [mindspore.mint.flatten](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.flatten.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | @@ -337,7 +337,7 @@ Because of the framework mechanism, MindSpore does not provide the following par | [torch.nn.SmoothL1Loss](https://pytorch.org/docs/2.1/generated/torch.nn.SmoothL1Loss.html) | [mindspore.mint.nn.SmoothL1Loss](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.SmoothL1Loss.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.Softmax](https://pytorch.org/docs/2.1/generated/torch.nn.Softmax.html) | [mindspore.mint.nn.Softmax](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.Softmax.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions)| | [torch.nn.Softshrink](https://pytorch.org/docs/2.1/generated/torch.nn.Softshrink.html) | [mindspore.mint.nn.Softshrink](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.Softshrink.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions)| -| [torch.nn.SyncBatchNorm](https://pytorch.org/docs/2.1/generated/torch.nn.SyncBatchNorm.html) | [mindspore.mint.nn.SyncBatchNorm](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.SyncBatchNorm.html) | 
[Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | +| [torch.nn.SyncBatchNorm](https://pytorch.org/docs/2.1/generated/torch.nn.SyncBatchNorm.html) | [mindspore.mint.nn.SyncBatchNorm](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.SyncBatchNorm.html) | Consistent functions, MindSpore is in inference mode by default. | | [torch.nn.Tanh](https://pytorch.org/docs/2.1/generated/torch.nn.Tanh.html) | [mindspore.mint.nn.Tanh](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.Tanh.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | | [torch.nn.Unfold](https://pytorch.org/docs/2.1/generated/torch.nn.Unfold.html) | [mindspore.mint.nn.Unfold](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.Unfold.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions)| | [torch.nn.Upsample](https://pytorch.org/docs/2.1/generated/torch.nn.Upsample.html) | [mindspore.mint.nn.Upsample](https://www.mindspore.cn/docs/en/master/api_python/mint/mindspore.mint.nn.Upsample.html) | [Consistent](https://www.mindspore.cn/docs/en/master/note/api_mapping/pytorch_api_mapping.html#api-mapping-consistency-criteria-and-exceptions) | diff --git a/docs/mindspore/source_zh_cn/conf.py b/docs/mindspore/source_zh_cn/conf.py index 85ad9bfa0b764ec91f5b21b7381247135267c772..b0bf72bde006038b3d54f88a0988d8501acec9af 100644 --- a/docs/mindspore/source_zh_cn/conf.py +++ b/docs/mindspore/source_zh_cn/conf.py @@ -101,6 +101,7 @@ with open(autodoc_source_path, "r", encoding="utf8") as f: exec(get_param_func_str, sphinx_autodoc.__dict__) exec(code_str, sphinx_autodoc.__dict__) +# 排除已写中文接口名 with open("../_ext/customdocumenter.txt", "r", encoding="utf8") as f: code_str = f.read() exec(code_str, sphinx_autodoc.__dict__) @@ -126,7 +127,7 @@ with open(sphinx_mathjax.__file__, "r", encoding="utf-8") as f: project = 'MindSpore' copyright = 'MindSpore' author = 'MindSpore' -# language = 'cn' + # The full version, including alpha/beta/rc tags release = 'master' @@ -163,8 +164,6 @@ source_suffix = { # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] -# locale_dirs = ['locale/'] - # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. @@ -352,7 +351,7 @@ for root, dirs, files in os.walk(api_file_dir, topdown=True): if '.rst' in file_ or '.txt' in file_: convert2utf8(os.path.join(root, file_)) -# Rename .rst file to .txt file for include directive. +# Rename .rst file to .txt file for include directive. 
master使用 from rename_include import rename_include rename_include('api_python') @@ -361,6 +360,15 @@ rename_include('migration_guide') # modify urls import json +# 发版本时这里启用 +# re_url = r"(((gitee.com/mindspore/docs/mindspore-lite)|(github.com/mindspore-ai/(mindspore|docs))|" + \ +# r"(mindspore.cn/(docs|tutorials|lite))|(obs.dualstack.cn-north-4.myhuaweicloud)|" + \ +# r"(mindspore-website.obs.cn-north-4.myhuaweicloud))[\w\d/_.-]*?)/(master)" + +# re_url2 = r"(gitee.com/mindspore/mindspore[\w\d/_.-]*?)/(master)" + +# re_url3 = r"(((gitee.com/mindspore/mindformers)|(mindspore.cn/mindformers))[\w\d/_.-]*?)/(dev)" + if os.path.exists('../../../tools/generate_html/version.json'): with open('../../../tools/generate_html/version.json', 'r+', encoding='utf-8') as f: version_inf = json.load(f) @@ -391,6 +399,13 @@ for cur, _, files in os.walk(des_sir): with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: content = f.read() new_content = content + # 发版本时这里启用 + # new_content = re.sub(re_url, r'\1/r2.7.0rc1', new_content) + # new_content = re.sub(re_url4, r'\1/r1.6.0', new_content) + # if i.endswith('.rst'): + # new_content = re.sub(re_url2, r'\1/v2.7.0-rc1', new_content) + + # master使用 if i.endswith('.md'): md_view = f'[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{docs_branch}/resource/_static/logo_source.svg)](https://gitee.com/mindspore/{copy_repo}/blob/{branch}/' + copy_path + cur.split('api_python')[-1] + '/' + i + ')\n\n' if 'resource/_static/logo_source' not in new_content: @@ -399,8 +414,11 @@ for cur, _, files in os.walk(des_sir): f.seek(0) f.truncate() f.write(new_content) + except Exception: print(f'打开{i}文件失败') + + # master使用 if i.endswith('.rst'): try: with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: @@ -418,6 +436,12 @@ for cur, _, files in os.walk(des_sir): except Exception: print(f'打开{i}文件失败') +# # Rename .rst file to .txt file for include directive. (发版本时这里启用) +# from rename_include import rename_include + +# rename_include('api_python') +# rename_include('migration_guide') + # rename file name to solve Case sensitive. 
rename_list = [("./api_python/ops/", "func_", ""), @@ -433,7 +457,7 @@ try: except Exception as e: print(e) -del_redundant_api_file(des_sir, ['mindspore.ops.rst', 'mindspore.ops.primitive.rst'], f'ops', 'mindspore.ops.', ops_del+primi_del, 'ops.silent_check.') +del_redundant_api_file(des_sir, ['mindspore.ops.rst', 'mindspore.ops.primitive.rst'], 'ops', 'mindspore.ops.', ops_del+primi_del, 'ops.silent_check.') del_redundant_api_file(des_sir, ['mindspore.nn.rst'], 'nn', 'mindspore.nn.', nn_del, 'optim_') del_redundant_api_file(des_sir, ['mindspore.mint.rst'], 'mint', 'mindspore.mint.', mint_del) del_redundant_api_file(des_sir, ['mindspore.numpy.rst'], 'numpy', 'mindspore.numpy.', numpy_del) @@ -665,5 +689,7 @@ else: content = content[0] with open(des_release, "w", encoding="utf-8") as p: + # content = re.sub(re_url, r'\1/r2.7.0rc1', content) + # content = re.sub(re_url2, r'\1/v2.7.0-rc1', content) p.write("# Release Notes" + "\n\n" + release_source) p.write(content) diff --git a/docs/mindspore/source_zh_cn/faq/network_compilation.md b/docs/mindspore/source_zh_cn/faq/network_compilation.md index 233a8a87c409967932be66d200fe8e00cd28a58c..739c48f38125e03df8a1cc693fede4ec0ee4aa7f 100644 --- a/docs/mindspore/source_zh_cn/faq/network_compilation.md +++ b/docs/mindspore/source_zh_cn/faq/network_compilation.md @@ -411,7 +411,7 @@ A: “External” 类型表示在图模式中使用了无法原生支持的对 ## Q: 编译时报错`Nested execution during JIT execution for 'xxx' is not supported when 'xxx' compile and execute.`怎么办? -A: 当触发编译流程,即代码编译成静态计算图时,见[Graph模式执行原理](https://www.mindspore.cn/docs/zh-CN/master/features/program_form/overview.html),同时在默认使用JIT Fallback特性时,再次进入编译流程时,则会抛出以上异常。 +A: 当触发编译流程,即代码编译成静态计算图时,同时在默认使用JIT Fallback特性时,再次进入编译流程时,则会抛出以上异常。 下面以JIT Fallback支持调用第三方库的对象和方法为例: diff --git a/docs/mindspore/source_zh_cn/features/compile/graph_optimization.md b/docs/mindspore/source_zh_cn/features/compile/compilation_guide_zh.md similarity index 33% rename from docs/mindspore/source_zh_cn/features/compile/graph_optimization.md rename to docs/mindspore/source_zh_cn/features/compile/compilation_guide_zh.md index e3d8f794240ffb56d96374c162acfb3da213de7f..fa0411c9fd48ab1f921b66f21d0e83464ecbd99e 100644 --- a/docs/mindspore/source_zh_cn/features/compile/graph_optimization.md +++ b/docs/mindspore/source_zh_cn/features/compile/compilation_guide_zh.md @@ -1,22 +1,83 @@ -# 图优化(编译) +# mindspore.jit 多级编译优化 -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_zh_cn/features/compile/graph_optimization.md) +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_zh_cn/features/compile/compilation_guide_zh.md) -与传统编译器类似,MindSpore 在进行完构图之后,也会进行编译优化。编译优化的主要目的是通过静态分析技术对 MindSpore 的中间表示 MindIR 进行分析和转换,以达成减小目标代码大小、提升代码执行效率、降低运行时资源开销或者提升其它性能指标的目的。编译优化是图编译系统中的重要一环,对提升整个神经网络模型的性能和资源利用率有着极其重要的意义,相较于未经过编译优化的原始代码,编译优化可能带来数倍甚至数十倍的性能提升。 -本节主要介绍独立于特定硬件的前端编译优化技术,特定于硬件的后端编译优化技术不在本节的讨论范围之内。 +## MindSpore编译架构 + +MindSpore利用jit(just-in-time)来进行性能优化。jit模式会通过AST树解析、Python字节码解析或追踪代码执行的方式,将python代码转换为中间表示图(IR,Intermediate Representation)。我们给它命名MindIR。编译器通过对该IR图的优化,来达到对代码的优化,提高运行性能。与动态图模式相对应,这种JIT的编译模式被称为graph mode。 -## 前端编译优化技术原理 +开发者写的python代码默认以动态图模式运行,可以通过`@mindspore.jit`装饰器修饰函数,来指定其按照graph mode执行。有关`@mindspore.jit`装饰器的相关文档请见[jit 
文档](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.jit.html#mindspore.jit)。 -与传统编译优化技术类似,MindSpore 中的编译优化也是通过一个个 Pass 来完成的。将每个 Pass 的上一个 Pass 所产生的 MindIR 作为输入,经过本 Pass 优化之后,产生新的 MindIR 表示作为输出。一个大的 Pass 可以包含多个小的 Pass,每个小的 Pass 只负责单点的编译优化,如:代数化简、函数内联(inline)、冗余消除等。一个 Pass 产生的优化结果,可能会为其它的 Pass 带来优化机会,故可以循环运行这些 Pass,直到产生的 MindIR 不再发生变化为止。 +graph mode大致分为3个阶段: + - 图捕获(构图): python代码 -> MindIR。 + - 图优化(前端): 对MindIR进行硬件无关优化,代数化简、函数inline(内联)、冗余消除等。 + - 图优化(后端): 对MindIR进行硬件相关优化,LazyInline,算子选择,图算融合等。 + +## 图捕获(构图) + +MindSpore提供三种捕获方式,如下 + - AST: 通过AST树解析的方式将执行的函数转换成IR图 + - bytecode(实验性): 对Python字节码的解析,尽可能的构建IR图,无法转换为IR图的部分则会按照动态图进行执行 + - trace(实验性): 通过追踪Python代码执行的轨迹来构建IR图 + +这三种模式在mindspore.jit中使用capture_mode来选择,以ast举例: 开发者可用`@mindspore.jit(capture_mode="ast")`装饰器修饰函,用ast方式修饰的函数,其语法有一定限制,我们提供两种模式供开发者选择。 +- strict模式:此模式目标是构成一张图,开发者的python代码如果无法构图,选择此模式运行程序时会报错,需要开发者进行代码修改,变为可构图的语法,适合追求性能的开发者。 +- lax模式:此模式目标是尽可能的让开发者程序可运行,思路是针对无法在strict模式构图的代码进行python fallback,即返回python层运行。 + +graph mode模式约束请参考[语法约束](https://www.mindspore.cn/tutorials/zh-CN/master/compile/static_graph.html)。ast如何将python代码解析并构图,举例如下: + +```python +@mindspore.jit +def foo(x, y): + z = x + y + return z +``` + +它对应的抽象语法树如下: + +![抽象语法树](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_zh_cn/features/compile/images/ast.png) + +通过解析上面的抽象语法树,我们得到下面的IR: + +```text +%para1_x: +%para2_y: + +subgraph instance: foo +subgraph @foo() { + %0(CNode_17) = PrimFunc_Add(%para1_x, %para2_y) + : (, ) -> () + Return(%0) + : () +} +``` + +**ast的优点**: + +- 使用ast模式,用户的编程自主性更强,性能优化更精准,可以根据函数特征以及使用经验将网络的性能调至最优。 + +**ast的限制**: + +- ast修饰的函数,其内部的语法必须严格遵守静态图语法来进行编程。 + +**ast模式的使用建议**: + +- 相比于动态图执行,被`@mindspore.jit`修饰的函数,在第一次调用时需要先消耗一定的时间进行编译。在该函数的后续调用时,若原有的编译结果可以复用,则会直接使用原有的编译结果进行执行。因此,使用@mindspore.jit装饰器修饰会多次执行的函数通常会获得更多的性能收益。 -编译优化过程中,选择运行哪些 Pass,如何安排这些 Pass 的执行顺序对生成的最终的编译结果有着非常重要的影响。可以按照实际情况,通过设定编译优化策略(如优化级别、次数等)来对即将执行的优化动作进行调整。 +- graph mode的运行效率优势体现在其会将被@mindspore.jit修饰函数进行全局上的编译优化,函数内含有的操作越多,优化的空间越大。因此`@mindspore.jit`装饰器修饰的函数最好是内含操作很多的大代码块,而不应将很多细碎的、仅含有少量操作的函数分别打上jit标签。否则,则可能会导致性能没有收益甚至劣化。 -## 常见前端编译优化技术 +- 绝大部分计算以及优化都是基于对Tensor计算的优化,建议被修饰的函数应该是用来进行真正的数据计算的函数,而不是一些简单的标量计算或者数据结构的变换。 -前端编译优化技术有很多,如:代数化简、函数inline(内联)、冗余消除等。本节将介绍部分具有代表性的编译优化技术。 +- 被`@mindspore.jit`修饰的函数,若其输入存在常量,那么该函数每次输入值的变化都会导致重新编译,关于变量常量的概念请见[即时编译下的常量与变量](https://www.mindspore.cn/tutorials/zh-CN/master/compile/static_graph)。因此,建议被修饰的函数以Tensor或者被Mutable修饰的数据作为输入。避免因多次编译导致的额外性能损耗。 -### 代数化简 +## 图优化(前端) + +与传统编译优化技术类似,MindSpore 中的编译优化也是通过一个个 Pass 来完成的。将每个 Pass 的上一个 Pass 所产生的 MindIR 作为输入,经过本 Pass 优化之后,产生新的 MindIR 表示作为输出。一个大的 Pass 可以包含多个小的 Pass,每个小的 Pass 只负责单点的编译优化,如:代数化简、函数内联(inline)、冗余消除等。一个 Pass 产生的优化结果,可能会为其它的 Pass 带来优化机会,故可以循环运行这些 Pass,直到产生的 MindIR 不再发生变化为止。 + +前端编译优化技术有很多,如: 代数化简、函数inline(内联)、冗余消除等。这里仅介绍具有代表性的编译优化技术。 + +### 1 代数化简 在传统编译器中,代数化简是一种编译器优化技术,旨在简化源代码中的代数表达式,消除多余计算,提高程序执行效率、减少内存占用等。 @@ -42,13 +103,13 @@ c = y; ```python import numpy as np -from mindspore.common import Tensor, jit +import mindspore -@jit +@mindspore.jit def func(x): return x + 0 -m = Tensor(np.array([[1, 2, 3], [4, 5, 6]]).astype(np.int32)) +m = mindspore.tensor(np.array([[1, 2, 3], [4, 5, 6]]).astype(np.int32)) out = func(m) ``` @@ -79,7 +140,7 @@ subgraph @1_func_14() { 代数化简能更多地涉及对计算图结构的修改,它通常还与其他编译器优化技术(如常量折叠、常量传播等)结合使用,共同提高程序性能。 -### 函数inline +### 2 函数inline 在传统编译器中,inline(内联)是一种优化技术,可以把被调用函数的代码直接替换到调用该函数的位置,提高程序运行效率。假设我们有一个 C++ 函数`add`,用于对两个数求和: @@ -108,21 +169,21 @@ int main() { 在 AI 框架的计算图编译器中,inline 
的目标类似,但操作对象从“函数”变成了“子图”(subgraph)。假设我们有一个 Python 程序: ```python -from mindspore import Tensor, jit, ops +from mindspore -def f2(x: Tensor, y: Tensor): +def f2(x: mindspore.Tensor, y: mindspore.Tensor): return x * 0.5 + y -@jit -def f1(a: Tensor, b: Tensor, c: Tensor): +@mindspore.jit +def f1(a: mindspore.Tensor, b: mindspore.Tensor, c: mindspore.Tensor): x = f2(a, b) y = f2(a, c) return x + y # 创建3个shape=(2, 4)的随机值Tensor -a = ops.randn(2, 4) -b = ops.randn(2, 4) -c = ops.randn(2, 4) +a = mindspore.ops.randn(2, 4) +b = mindspore.ops.randn(2, 4) +c = mindspore.ops.randn(2, 4) out = f1(a, b, c) ``` @@ -189,59 +250,22 @@ subgraph @f1() { 通过 inline 将子图展开,编译器能够更清晰地识别跨子图的优化机会,除了公共子表达式消除 (CSE),还能够触发算子融合、内存管理等许多优化措施。因此 inline 是计算图编译器的一项重要优化机制,也是许多跨图优化的基础。 -### 冗余消除 +### 3 冗余消除 在传统编译器中,冗余消除包含了多种编译优化技术,旨在通过在编译期间识别出代码中存在冗余的部分并进行消除,达到减少不必要的计算,提高程序的执行效率的目的。 通常冗余代码可能是用户出于可读性等目的有意编写的,也可能仅仅是编码过程中的无心之举。此外,编译优化过程本身通过其它优化技术(如:代数化简、inline、公共子表达式消除等)产生的中间结果,也可能带来冗余消除的机会。 -冗余消除的技术有很多,本节挑选了其中常见的无用代码消除、不可达代码消除进行介绍。 - -1. **无用代码消除** - - 消除计算结果未被使用的代码。例如:下面的 C++ 代码中,变量 `c` 未被任何其它代码使用,编译器可以通过静态分析领域的数据流分析等技术,将计算 `int c = x * y` 的这行代码消除。 - - ```cpp - int func(x, y) { - int a = x + y; - int b = x - y; - int c = x * y; // 无用代码 - int d = a / b; - return d; - } - ``` - -2. **不可达代码消除** - - 消除未被有效控制流路径包含的代码。例如:下面的 C++ 代码中,编译器可以通过静态分析领域的控制流分析技术,分析代码的控制流图,识别到表达式 `1 < 0` 恒不成立,从而控制流 `1 < 0` 包含的代码在实际运行期间必定不会被执行,故可将该分支的代码消除。 - - ```cpp - int func(x, y) { - int a = x + y; - - int b; - if 1 < 0 { // 不可达分支 - b = x + y; - } else { - b = x - y; - } - - int d = a / b; - return d; - } - ``` - -MindSpore 图模式下冗余消除的目的及使用的技术也类似。与传统编译器不同的是,这些冗余优化技术是在 MindIR 上完成的。类似的,MindSpore 中常见的冗余消除技术有: +MindSpore冗余消除的目的及使用的技术与传统编译器类似。不同的是这些冗余优化是在 MindIR 上完成的。例如: 1. **无用代码消除** 假设有如下存在冗余计算的Python代码: ```python - import mindspore as ms - from mindspore.common import Tensor, jit - - @jit + import mindspore + + @mindspore.jit def func(x, y): a = x + y b = x - y @@ -249,12 +273,12 @@ MindSpore 图模式下冗余消除的目的及使用的技术也类似。与传 d = a / b return d - x = Tensor(20, ms.float32) - y = Tensor(10, ms.float32) + x = mindspore.tensor(20, mindspore.float32) + y = mindspore.tensor(10, mindspore.float32) out = func(x, y) ``` - MindSpore 图编译器会通过静态分析将 `@jit` 修饰的 Python 代码转换为 MindIR 的表示形式并消除其中冗余的 `c = x * y` 的计算,最终生成的 MindIR 如下: + MindSpore 图编译器会通过静态分析将 `@mindspore.jit` 修饰的 Python 代码转换为 MindIR 的表示形式并消除其中冗余的 `c = x * y` 的计算,最终生成的 MindIR 如下: ```text # Params: @@ -278,10 +302,9 @@ MindSpore 图模式下冗余消除的目的及使用的技术也类似。与传 假设有如下存在不可达路径的Python代码: ```python - import mindspore as ms - from mindspore.common import Tensor, jit + import mindspore - @jit + @mindspore.jit def func(x, y): a = x + y if 1 < 0: # 不可达分支 @@ -291,12 +314,12 @@ MindSpore 图模式下冗余消除的目的及使用的技术也类似。与传 d = a / b return d - x = Tensor(20, ms.float32) - y = Tensor(10, ms.float32) + x = mindspore.tensor(20, mindspore.float32) + y = mindspore.tensor(10, mindspore.float32) out = func(x, y) ``` - MindSpore 图编译器会通过静态分析将 `@jit` 修饰的 Python 代码转换为 MindIR 的表示形式并消除其中冗余的控制流分支 `1 < 0` 的代码,最终生成的 MindIR 如下: + MindSpore图编译器会通过静态分析将 `@mindspore.jit` 修饰的 Python 代码转换为 MindIR 的表示形式并消除其中冗余的控制流分支 `1 < 0` 的代码,最终生成的 MindIR 如下: ```text # Params: @@ -316,3 +339,109 @@ MindSpore 图模式下冗余消除的目的及使用的技术也类似。与传 ``` 冗余消除在编译优化中扮演着重要的角色,在不改变程序原语义的前提下,能够显著提高程序的执行效率,通过减少不必要的运行时计算节省计算资源。冗余消除通常还与其它编译优化技术结合使用以获得更多消除冗余代码的机会。 + +## 图优化(后端) + +当MindIR图经过前端优化完成后,需要进行进一步优化(包含目标硬件)。优化模式我们分为O0,O1,用参数jit_level表示 + - **jit_level=O0**: 只做基本的图切分优化,以及算子选择(硬件相关),优点是可以保证IR图的原始结构,编译速度较快。 + - **jit_level=O1**: 增加图优化和自动算子融合,编译性能有所损失,但模型开始训练后,效率较高 + +MindIR经过本轮优化后,会由runtime模块进行执行,涉及多级流水并发等技术,可参考[多级流水] + 
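As a rough illustration of choosing between the two levels listed above, the sketch below compiles the same function with the `jit_level` argument of `mindspore.jit` that this page refers to. The `dense_block` body, the tensor shapes and the values are placeholders invented for the example; only the `jit_level="O0"`/`"O1"` switch reflects the text above.

```python
# Illustrative sketch: the same function compiled at the two optimization levels
# described above. jit_level="O0" keeps the original graph structure and compiles
# quickly; jit_level="O1" additionally enables graph optimization and automatic
# operator fusion.
import numpy as np
import mindspore

def dense_block(x, w, b):
    # Placeholder computation used only for this example.
    return mindspore.ops.relu(mindspore.ops.matmul(x, w) + b)

block_o0 = mindspore.jit(dense_block, jit_level="O0")  # easier to debug
block_o1 = mindspore.jit(dense_block, jit_level="O1")  # adds operator fusion

x = mindspore.tensor(np.ones((2, 3)), mindspore.float32)
w = mindspore.tensor(np.ones((3, 4)), mindspore.float32)
b = mindspore.tensor(np.ones((4,)), mindspore.float32)
print(block_o0(x, w, b))
print(block_o1(x, w, b))
```

In practice, O0 is the more convenient level while debugging, and O1 is worth enabling once the network is stable, as the sections below describe.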
+### jit_level=O0 模式 + +O0模式的优化较少,基础的优化主要为后端LazyInline和No-task node执行优化。 + +- **LazyInline**: 主要思想是将函数调用的开销推迟到实际需要调用的时候,这样可以减少编译时的开销,提高编译效率。LazyInline在图编译阶段是将相同的子图结构复用,不展开放在图中,避免图规模较大导致影响编译性能。 + + ![jit_level_lazyinline](./images/multi_level_compilation/jit_level_lazyinline.png) + +- **No-task node执行优化**: No-task node指的是Reshape、ExpandDims、Squeeze、Flatten、FlattenGrad、Reformat等诸类算子没有计算逻辑,不修改内存排布,仅修改shape、format等信息。在图编译结束后,将No-task node转换成ref node,输出跟输入同地址,执行过程中跳过kernel launch,从而达到执行性能优化目的。 + + ![jit_level_no_task](./images/multi_level_compilation/jit_level_no_task.png) + +#### 算子选择 + +算子是深度学习框架中的基本执行单元,它们负责执行特定的计算任务,如矩阵乘法、卷积、池化等。算子选择需要综合考虑算子类型、数据类型、硬件平台和算子优化等因素,以选择最优的算子来实现模型运行效率最高。 + +MindSpore 在Ascend硬件的算子类型有aclnn kernel/aclop kernel/hccl kernel/cpu kernel,算子选择流程如下图所示: + +![jit_level_kernelselect](./images/multi_level_compilation/jit_level_kernelselect.png) + +1. 算子类型:首先根据算子类型选择为计算算子还是通信算子。 +2. 硬件平台:如果硬件上有对应算子,则优先选择硬件上的算子,否则选择CPU上的算子(异构),例如shape相关的计算算子可能只适合在CPU上支持,没有对应的硬件算子。 +3. 算子效率:ascend硬件由于aclnn算子较好的性能,因此计算类型算子如果有对应aclnn kernel,则优先选择aclnn kernel,否则就选择aclop kernel。 +4. 如果上述3步都未选择到算子,则为不支持的算子,算子选择失败报错。 + +#### 执行序编排 + +不同图遍历算法产生的执行序在执行性能跟内存上会有较大的差异,如图所示: +![jit_level_exec_order](./images/multi_level_compilation/jit_level_exec_order.png) + +- **BFS得到的执行序**:kernel1-> kernel2-> kernel4-> kernel5-> kernel3-> kernel6,内存峰值为5G(kernel3执行后可以把kernel1和kernel2的释放掉,则轮到kernel6执行的时候则能复用,因此kernel6 不用额外申请多的内存)。 +- **DFS得到的执行序**:kernel1-> kernel2-> kernel3-> kernel4-> kernel5-> kernel6,内存峰值为4G(kernel3执行后可以把kernel1和kernel2的释放掉,则轮到kernel4和kernel5执行的时候则能复用,因此kernel4和kernel5不用额外申请多的内存)。 + +执行序编排是在一定内存限制下求解最优算子并发的复杂性问题,不仅需要识别和利用计算图中的并发机会,以提升计算效率,还必须同时考虑多种限制条件,以确保系统的稳定性和高效性。 + +- 首先,优化模块需要解决求解最优算子并发的复杂性问题。由于计算图中的算子数量庞大且相互依赖,找到一个既能最大化并发又能保持计算图逻辑正确性的执行顺序是一个极具挑战性的任务。 +- 其次,内存限制是执行序优化中不可忽视的关键因素。增大并发虽然可以提升计算效率,但往往会显著增加峰值内存需求,从而可能导致内存溢出(OOM)错误,尤其是在资源受限的环境中。因此,优化模块必须权衡并发与内存使用之间的关系,确保在提升并发的同时,不会超出系统的内存容量。 +- MindSpore的执行序调整模块结合了基于规则和基于启发式策略的方式,提供bfs/dfs两种执行序编排算法[mindspore.jit(option={"exec_order":"bfs/dfs"})](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.jit.html),以实现对计算图执行顺序的精细调整,从而在保证计算效率的同时,有效应对内存限制和系统稳定性等多重挑战。 + +### jit_level=O1 模式 + + 当前O1主要支持了图算融合优化。其主要思路是:在编译阶段,自动识别计算图中相邻的可融合节点,然后将其融合为更大粒度的可执行算子。通过图算融合,实现增加算子计算局部性、减少整体全局内存访存带宽开销等优化效果。通过对主流SOTA模型的实测验证,O1能够实现相比O0平均15%的性能加速。特别是对于访存密集型网络,O1优化效果更加显著。 + +#### 图算融合 + +MindSpore等主流AI计算框架对用户提供的算子通常是从用户可理解、易使用角度进行定义。每个算子承载的计算量不等,计算复杂度也各不相同。但从硬件执行角度看,这种天然的、基于用户角度的算子计算量划分,并不高效,也无法充分发挥硬件资源计算能力。主要体现在: + +1. 计算量过大、过复杂的算子,通常很难生成切分较好的高性能算子,从而降低设备利用率; +2. 计算量过小的算子,由于计算无法有效隐藏数据搬移开销,也可能会造成计算的空等时延,从而降低设备利用率; +3. 硬件Device通常为多核、众核结构,当算子shape较小或其他原因引起计算并行度不够时,可能会造成部分核的空闲,从而降低设备利用率。特别是基于专用处理器架构(Domain Specific Architecture,后文简称DSA)的芯片对这些因素更为敏感。如何最大化发挥硬件算力性能的同时使算子也能具备较好的易用性,一直以来是一个很大的挑战。 + +在AI框架设计方面,目前业界主流采用图层和算子层分层的实现方法。图层负责对计算图进行融合或重组,算子层负责将融合或重组后的算子编译为高性能的可执行算子。图层通常采用基于Tensor的High-Level IR的处理和优化,算子层则采用基于计算指令的Low-Level IR进行分析和优化。 这种人为分层处理显著增加了图、算两层进行协同优化的难度。 + +MindSpore在过去几年的技术实践中,采用了图算融合的技术来较好的解决了这个问题。NLP、推荐等不同类别的典型网络在使能图算融合后训练速度都有明显收益。主要原因之一就是这些网络中存在大量小算子组合,具有较多的融合优化机会。 + +#### 图算融合架构及整体流程 + +图算融合整体架构如下图所示。在图层主要思路是把复合算子打开,然后进行跨边界聚合和优化,最后进行Kernel算子拆分。主要步骤包括: + +1. Composite Expansion:将复合算子展开为基本算子,并构成Composite子图,方便进行后续的跨边界优化和算子拆分; +2. Cross-OP Aggregation:将相邻的基本算子或Composite子图进行聚合,从而构成更大的聚合子图,方便进行后续的跨边界优化和算子拆分; +3. High-Level Optimization:在上面两步得到的聚合子图的基础上,我们可以进行大量的跨边界优化,如代数化简、公共子表达式提取(CSE)等; +4. 
Kernel Partition:基于计算特征以及融合算子性能,对聚合计算子图进行算子拆分。 + +优化后的计算图会以一个个子图的方式传给MindSpore AKG继续进行后端优化、生成目标代码。 + +![graphkernel](./images/graphkernel.png) + +通过以上步骤,我们可以获得两方面性能收益: + +1. 不同算子之间的跨边界性能优化收益; +2. 通过对整个计算图进行重组拆分,得到最优粒度的融合算子。 + +#### 融合算子加速优化(MindSpore AKG) + +前文提到,在HPC、深度神经网络训练等场景中,图算融合优化可带来成倍的性能提升。但随着图算融合能力的不断增强,融合算子的开发成为了继续提升图算融合能力的瓶颈点。 + +融合算子的自动生成技术可以解决基于DSA开发融合算子编程门槛较高的问题,让程序员在算子开发过程中能够聚焦于算子的实现逻辑,无需关注后端优化,极大提高其开发效率。尤其对于后端硬件架构复杂以及存在复杂算子和融合算子的场景,算子自动生成技术更加关键。 + +因此,**MindSpore AKG基于多面体编译技术(Polyhedral Model),对融合算子的加速优化与自动生成**,能够帮助MindSpore的图算融合模块优化后的融合算子在**异构硬件平台**(GPU/Ascend)上自动生成高性能的kernel,提升MindSpore的训练性能。 + +架构及整体流程如下: + +![graphkernel_akg_overview](./images/graphkernel_akg_overview.png) + +MindSpore AKG的整体框架如上图所示: + +- IR规范化 + - MindSpore AKG的输入为MindSpore图算融合模块优化后的融合子图,通过TVM的Compute / IR Builder / Hybrid 等多种描述方式对子图中的算子进行表达。然后DSL会被转换为 Halide IR([Halide](https://halide-lang.org/),是常见的用于开发高性能图像处理和Array计算的语言,可作为中间表达解耦算法和优化)并进行 IR 规范化; + - 完成初步简化和优化后,Halide IR会被转化为Poly模块所需的调度树; +- Poly模块调度优化 + - 利用Polyhedral技术中的Pluto调度算法,实现循环的自动融合、自动重排等变换,为融合算子自动生成满足并行性、数据局部性的初始调度; + - 为快速适配不同硬件后端,Poly模块内的优化pass会分为硬件无关的通用优化与硬件相关的特定优化,编译时按照硬件特征拼接组合,实现异构硬件后端的快速适配。自动切分、自动映射以及自动内存提升等pass会根据不同硬件的架构性质给出不同的优化方式; +- 后端优化 + - 为了进一步提升算子的性能,我们针对不同硬件后端开发了相应的优化pass,如Ascend后端中实现数据对齐、指令映射,GPU后端中实现向量化存取,插入同步指令等,最终生成相应平台代码。 + +总结: MindSpore编译从图捕获模式,IR优化图算融合等各维度对AI模型代码进行优化,很多特性在易用性和性能方面的取舍也有一定挑战。我们也规划进一步分层解耦整个流程,避免黑盒运行,增加开发者理解的门槛。 \ No newline at end of file diff --git a/docs/mindspore/source_zh_cn/features/compile/graph_construction.ipynb b/docs/mindspore/source_zh_cn/features/compile/graph_construction.ipynb deleted file mode 100644 index a8ed39198a1f1a458412c89e5f310d68c67c760e..0000000000000000000000000000000000000000 --- a/docs/mindspore/source_zh_cn/features/compile/graph_construction.ipynb +++ /dev/null @@ -1,273 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 构图(编译)\n", - "\n", - "[![下载Notebook](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_notebook.svg)](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/notebook/master/zh_cn/features/compile/mindspore_graph_construction.ipynb) [![下载样例代码](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_download_code.svg)](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/notebook/master/zh_cn/features/compile/mindspore_graph_construction.py) [![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_zh_cn/features/compile/graph_construction.ipynb)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "MindSpore提供JIT(just-in-time)技术来进行性能优化。JIT模式会通过AST树解析、Python字节码解析或追踪代码执行的方式,将代码解析为一张中间表示图(IR,intermediate representation)。IR图作为该代码的唯一表示,编译器通过对该IR图的优化,来达到对代码的优化,提高运行性能。与动态图模式相对应,这种JIT的编译模式被称为静态图模式。\n", - "\n", - "基于JIT技术,MindSpore提供了动静结合的方法来提高用户的网络的运行效率。动静结合,即在整体运行为动态图的情况下,指定某些代码块以静态图的方式运行。按照静态图方式运行的代码块会采取先编译后执行的运行模式,在编译期对代码进行全局优化,来获取执行期的性能收益。用户可以通过`@jit`装饰器修饰函数,来指定其按照静态图的模式执行。有关`@jit`装饰器的相关文档请见[jit API文档](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.jit.html#mindspore.jit)。\n", - "\n", - "MindSpore提供了三种JIT编译方式,分别通过ast、bytecode和trace的方式来构图。ast是通过AST树解析的方式,将用户手工标识需要按照ast方式执行的函数转换成静态图。bytecode则是通过对Python字节码的解析,在动态图中尽可能的构建静态图,无法转换为静态图的部分则会按照动态图进行执行,来达到动静结合的目的。trace是通过追踪Python代码执行的轨迹来构建静态图,当前属于实验性质的特性。后续介绍会详细说明三者原理的不同以及各自的特点。\n" - ] - }, 
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Ast\n", - "\n", - "在动态图模式下,用户可以通过`@jit(capture_mode=\"ast\")`装饰器修饰函数来让该函数以ast方式来执行。用ast方式修饰的函数,其内部使用的语法以及数据结构需要遵守静态图语法规范[静态图语法规范](https://www.mindspore.cn/tutorials/zh-CN/master/compile/static_graph.html)。ast方式通过源到源的方式来编译Python代码,先把模型定义的Python源码解析成抽象语法树,然后把抽象语法树解析为MindIR。例如下面的Python代码:\n", - "\n", - "```python\n", - "@jit\n", - "def foo(x, y):\n", - " z = x + y\n", - " return z\n", - "```\n", - "\n", - "它对应的抽象语法树如下:\n", - "\n", - "![抽象语法树](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_zh_cn/features/compile/images/ast.png)\n", - "\n", - "通过解析上面的抽象语法树,我们得到下面的MindIR:\n", - "\n", - "```text\n", - "%para1_x: \n", - "%para2_y: \n", - "\n", - "subgraph instance: foo\n", - "subgraph @foo() {\n", - " %0(CNode_17) = PrimFunc_Add(%para1_x, %para2_y)\n", - " : (, ) -> ()\n", - " Return(%0)\n", - " : ()\n", - "}\n", - "```\n", - "\n", - "**ast的使用方法**:\n", - "\n", - "用户可以通过`@jit`装饰器来指定函数以静态图的方式来执行,例如:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[4. 4. 4. 4.]\n", - " [4. 4. 4. 4.]]" - ] - } - ], - "source": [ - "import numpy as np\n", - "import mindspore as ms\n", - "from mindspore import ops\n", - "from mindspore import jit\n", - "from mindspore import Tensor\n", - "\n", - "@jit\n", - "def tensor_cal(x, y, z):\n", - " return ops.matmul(x, y) + z\n", - "\n", - "x = Tensor(np.ones(shape=[2, 3]), ms.float32)\n", - "y = Tensor(np.ones(shape=[3, 4]), ms.float32)\n", - "z = Tensor(np.ones(shape=[2, 4]), ms.float32)\n", - "ret = tensor_cal(x, y, z)\n", - "print(ret)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "上述用例中,tensor_cal函数被@jit装饰器修饰,该函数被调用时就会按照静态图的模式进行执行,以获取该函数执行期的性能收益。\n", - "\n", - "**ast的优点**:\n", - "\n", - "- 使用ast模式,用户的编程自主性更强,性能优化更精准,可以根据函数特征以及使用经验将网络的性能调至最优。\n", - "\n", - "**ast的限制**:\n", - "\n", - "- ast修饰的函数,其内部的语法必须严格遵守静态图语法来进行编程。\n", - "\n", - "**ast模式的使用建议**:\n", - "\n", - "- 相比于动态图执行,被`@jit`修饰的函数,在第一次调用时需要先消耗一定的时间进行静态图的编译。在该函数的后续调用时,若原有的编译结果可以复用,则会直接使用原有的编译结果进行执行。因此,使用@jit装饰器修饰会多次执行的函数通常会获得更多的性能收益。\n", - "\n", - "- 静态图模式的运行效率优势体现在其会将被@jit修饰函数进行全局上的编译优化,函数内含有的操作越多,优化的上限也就越高。因此`@jit`装饰器修饰的函数最好是内含操作很多的大代码块,而不应将很多细碎的、仅含有少量操作的函数分别打上jit标签。否则,则可能会导致性能没有收益甚至劣化。\n", - "\n", - "- MindSpore静态图绝大部分计算以及优化都是基于对Tensor计算的优化,因此我们建议被修饰的函数应该是那种用来进行真正的数据计算的函数,而不是一些简单的标量计算或者数据结构的变换。\n", - "\n", - "- 被`@jit`修饰的函数,若其输入存在常量,那么该函数每次输入值的变化都会导致重新编译,关于变量常量的概念请见[即时编译下的常量与变量](https://www.mindspore.cn/tutorials/zh-CN/master/compile/static_graph.html#%E5%8D%B3%E6%97%B6%E7%BC%96%E8%AF%91%E4%B8%8B%E7%9A%84%E5%B8%B8%E9%87%8F%E4%B8%8E%E5%8F%98%E9%87%8F)。因此,建议被修饰的函数以Tensor或者被Mutable修饰的数据作为输入。避免因多次编译导致的额外性能损耗。" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Bytecode\n", - "\n", - "除了ast,MindSpore提供另外一种静态化加速机制bytecode,用户可以通过`@jit(capture_mode=\"bytecode\")`装饰器修饰函数来让该函数以bytecode模式来执行。当bytecode识别到不支持进入静态图的语法时,会回退到Python执行而非直接编译报错。该功能同时兼顾性能和易用性,减少编译报错的发生。它基于Python字节码的分析,对Python的执行流进行图捕获,让可以以静态图方式运行的子图以静态图方式运行,并让Python语法不支持的子图以动态图方式运行,同时通过修改调整字节码的方式链接动静态图,达到动静混合执行。在满足易用性的前提下,尽可能地提高性能。\n", - "\n", - "**bytecode的运行原理**:\n", - "\n", - "1. 基于Python虚拟机_PyInterpreterState_SetEvalFrameFunc捕获Python函数的执行,采用上下文管理的方式捕获执行区域内的所有Python函数执行。\n", - "2. 按照当前的运行时输入参数结合函数字节码进行分析,构造控制流图(CFG)以及数据流图(DFG)。\n", - "3. 模拟进栈出栈操作,跟踪逐个字节码,根据栈输入,推导输出。Python3.7~Python3.11每条字节码都有对应的模拟实现,注意是推导输出的类型尺寸,而不是真正执行得到值,除非常量折叠。\n", - "4. 
在模拟执行字节码的过程中,将推导结果和操作翻译成MindIR,最后,通过常量折叠,UD分析(删除无用的输入输出参数)等方式,优化静态图。\n", - "5. 在执行等效的静态图之前,对输入参数和优化过程中产生的看护Guard条件进行比对,根据运行时信息,选择匹配的静态图执行。\n", - "6. 动态管理看护Guard和静态图缓冲的匹配关系,对不常用的静态图缓冲进行回收,通过Symbolic Shape和Dynamic Shape优化静态图缓冲。\n", - "\n", - "bytecode的编译流程如下图所示\n", - "\n", - "![bytecode的编译流程](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/docs/mindspore/source_zh_cn/features/compile/images/bytecode.png)\n", - "\n", - "**bytecode的使用方式**:\n", - "\n", - "将jit的capture_mode参数设置为bytecode,即可将修饰函数的运行模式切换为bytecode,例如:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[4. 4. 4. 4.]\n", - " [4. 4. 4. 4.]]" - ] - } - ], - "source": [ - "import numpy as np\n", - "import mindspore as ms\n", - "from mindspore import ops\n", - "from mindspore import jit\n", - "from mindspore import Tensor\n", - "\n", - "@jit(capture_mode=\"bytecode\")\n", - "def tensor_cal(x, y, z):\n", - " return ops.matmul(x, y) + z\n", - "\n", - "x = Tensor(np.ones(shape=[2, 3]), ms.float32)\n", - "y = Tensor(np.ones(shape=[3, 4]), ms.float32)\n", - "z = Tensor(np.ones(shape=[2, 4]), ms.float32)\n", - "ret = tensor_cal(x, y, z)\n", - "print(ret)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**bytecode的优点**:\n", - "\n", - "- 用户体验好,无需人工介入,用户编写的网络代码总是能够正常运行,静态图不能执行的代码会自动采用动态图运行。\n", - "- bytecode可以通过对字节码的变换,使得更多的语句进入静态图。用户无需感知或修改代码。\n", - "\n", - "**bytecode的限制**:\n", - "\n", - "- 用户无法明确对某些代码做性能加速,对于裂图较多的场景,性能加速的效果可能会不明显。" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Trace\n", - "\n", - "MindSpore也提供另外一种静态化加速机制trace,用户可以通过`@jit(capture_mode=\"trace\")`装饰器修饰函数来让该函数以trace模式来执行。在该模式下,代码会先以PyNative模式运行,在运行时调用的算子会被记录,并被捕获到计算图中。在后续执行该装饰器修饰的代码时,会直接执行第一次执行所构造出的计算图。该功能不会解析语法,只会捕获运行时调用的算子,因此不会有语法不支持报错的发生。它基于捕获运行PyNative模式时调用的算子,对Python的执行流进行图捕获,将捕获到的算子编入计算图中。没有对应算子的操作将无法生成节点,trace流程将只捕获该操作的返回值,在计算图中作为常量。生成的计算图以静态图的运行方式运行。\n", - "\n", - "**trace的使用方式**:\n", - "\n", - "将jit的capture_mode参数设置为trace,即可将修饰函数的运行模式切换为trace,例如:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[4. 4. 4. 4.]\n", - " [4. 4. 4. 
4.]]" - ] - } - ], - "source": [ - "import numpy as np\n", - "import mindspore as ms\n", - "from mindspore import ops\n", - "from mindspore import jit\n", - "from mindspore import Tensor\n", - "\n", - "@jit(capture_mode=\"trace\")\n", - "def tensor_cal(x, y, z):\n", - " return ops.matmul(x, y) + z\n", - "\n", - "x = Tensor(np.ones(shape=[2, 3]), ms.float32)\n", - "y = Tensor(np.ones(shape=[3, 4]), ms.float32)\n", - "z = Tensor(np.ones(shape=[2, 4]), ms.float32)\n", - "ret = tensor_cal(x, y, z)\n", - "print(ret)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**trace的优点**:\n", - "\n", - "- 构图能力强,只要代码有对应算子就能够入图,不需要额外适配。构建静态图时不会有语法不支持报错。\n", - "- 用户体验好,无需人工介入,用户编写的网络代码总是能够正常运行。\n", - "\n", - "**trace的限制**:\n", - "\n", - "- 无法感知控制流,多次运行时控制流会进入不同分支的场景无法保证正确性。\n", - "- 没有定义为算子的操作,如第三方库会在计算图中被固定为常量,多次运行无法保证正确性。" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:base] *", - "language": "python", - "name": "conda-base-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/mindspore/source_zh_cn/features/compile/multi_level_compilation.md b/docs/mindspore/source_zh_cn/features/compile/multi_level_compilation.md deleted file mode 100644 index 0675dac4e7bf36a161bcfd117866089765510b1e..0000000000000000000000000000000000000000 --- a/docs/mindspore/source_zh_cn/features/compile/multi_level_compilation.md +++ /dev/null @@ -1,139 +0,0 @@ -# 多级编译介绍(编译) - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_zh_cn/features/compile/multi_level_compilation.md) - -## 背景 - -随着深度学习大模型时代的到来,网络规模越来越大,对图编译性能、执行性能和调试调优效率的挑战也越来越大。为此,MindSpore提出多级编译架构,提供O(n)多级编译执行模式,它们在图优化、算子融合、内存管理以及执行模式等方面有所不同,旨在提供图模式的多样性选择,用户可以根据自己的网络特点和需求,选择最适合的编译执行模式: - -1. O0模式:这是一个基础的编译执行模式,除必要影响功能的优化外,其他优化均关闭,使用单算子执行的执行方式。因此执行性能可能不是最优,但它的优点是可以保证图的原始结构,方便用户进行调试和理解,编译性能也较好。如下图中的Add和Mul单算子执行。 -2. O1模式:这个模式会进行一些基础的优化,比如常用图优化和自动算子融合优化,使用单算子执行的执行方式。相比O0,由于使能了融合优化,可以提高执行性能,但可能会影响到图的原始结构,因此编译性能和调试调优效率有所损失。如下图中的Add跟Mul融合成一个fused_op执行。 -3. O2模式:这是一个更高级的优化模式,目前没有实现,后续较为深层次的优化可使用该模式。 - -![jit_level_example](./images/multi_level_compilation/jit_level_example.png) - -## 多级编译架构概述 - -![jit_level_framework](./images/multi_level_compilation/jit_level_framework.png) - -1. 多级编译对外接口:通过[mindspore.jit(jit_level="O0/O1")](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.jit.html#mindspore.jit)来配置多级编译级别,jit_level默认为O0,通常我们建议用户使用O0模式进行网络调试调优,调试就绪后,为了更好的性能可以一键开启O1运行网络。 -2. 后端图编译:根据配置的多级编译级别,选择不同的编译模式,O0为最基础的原生构图与编译,O1在O0基础增加了自动算子融合功能,主要功能有图优化、图算融合、算子选择、执行序编排,其中图算融合为O1模式下独有功能。 -3. 
后端图执行:O0跟O1模式执行层面是一样的,均使用单算子方式调度执行,主要功能有多流并发、多级流水、HAL管理、内存管理。 - -## O0模式介绍 - -O0为基础的图编译执行模式,除必要影响功能的优化外,其他优化均关闭,使用原生的图结构进行编译和执行,方便调试调优,具备较好的编译性能。下面主要介绍后端图编译相关功能,后端图执行相关功能详见[运行时](https://www.mindspore.cn/docs/zh-CN/master/features/runtime/memory_manager.html)。 - -### 图优化 - -O0模式的图优化较少,基础的优化主要为后端LazyInline和No-task node执行优化。 - -- **后端LazyInline** - - **LazyInline**:主要思想是将函数调用的开销推迟到实际需要调用的时候,这样可以减少编译时的开销,提高编译效率。LazyInline在图编译阶段是将相同的子图结构复用,不展开放在图中,避免图规模较大导致影响编译性能。 - - ![jit_level_lazyinline](./images/multi_level_compilation/jit_level_lazyinline.png) - - **流水线(Pipeline)并行**:将神经网络中的算子切分成多个Stage,再把Stage映射到不同的设备上,使得不同设备去计算神经网络的不同部分。为了提升效率,流水线并行进一步将小批次(MiniBatch)切分成更细粒度的微批次(MicroBatch),在微批次中采用流水线式的调度,从而达到提升效率的目的。 - - **后端LazyInline**:由于Pipeline并行的MicroBatch切分会导致整个计算图扩张到MicroBatch的数量倍,从而导致模型规模巨大,编译性能时间较长(可能小时级别)。而这些Micro子图结构都是一样的,为了解决编译性能问题,LazyInline技术则非常契合,不过LazyInline带来的问题就是运行时无法采用最优的方式进行内存复用和流分配、无法做跨图的优化(内存优化、通信融合、算子融合等)等问题。为此,在图编译结束后,在图执行之前,将这些Micro子图做实际的节点Inline,以形成完整的全局整图,再通过图Inline后的内存优化、通信优化、冗余计算消除等方式,从而实现在编译性能、执行性能、执行内存方面都兼顾的目标。 - -- **No-task node执行优化** - - ![jit_level_no_task](./images/multi_level_compilation/jit_level_no_task.png) - - No-task node指的是Reshape、ExpandDims、Squeeze、Flatten、FlattenGrad、Reformat等诸类算子没有计算逻辑,不修改内存排布,仅修改shape、format等信息。在图编译结束后,将No-task node转换成ref node,输出跟输入同地址,执行过程中跳过kernel launch,从而达到执行性能优化目的。 - -### 算子选择 - -算子是深度学习框架中的基本执行单元,它们负责执行特定的计算任务,如矩阵乘法、卷积、池化等。算子选择需要综合考虑算子类型、数据类型、硬件平台和算子优化等因素,以选择最优的算子来实现深度学习任务。 - -MindSpore Ascend后端的算子类型有Aclnn kernel/Aclop kernel/Hccl kernel/Cpu kernel,算子选择流程如下图所示: - -![jit_level_kernelselect](./images/multi_level_compilation/jit_level_kernelselect.png) - -1. 算子类型:首先根据算子类型选择为计算算子还是通信算子。 -2. 硬件平台:如果硬件上有对应算子,则优先选择硬件上的算子,否则选择CPU上的异构算子,例如shape相关的计算算子可能只适合在CPU上支持,没有对应的硬件算子。 -3. 算子效率:Ascend上由于Aclnn算子较好的性能,因此计算类型算子如果有对应Aclnn kernel,则优先选择Aclnn kernel,否则就选择Aclop kernel。 -4. 如果上述3步都未选择到算子,则为不支持的算子,算子选择失败退出。 - -### 执行序编排 - -![jit_level_exec_order](./images/multi_level_compilation/jit_level_exec_order.png) - -不同图遍历算法产生的执行序在执行性能跟内存上会有较大的差异,如上图所示: - -- **BFS得到的执行序**:kernel1-> kernel2-> kernel4-> kernel5-> kernel3-> kernel6,内存峰值为5G(kernel3执行后可以把kernel1和kernel2的释放掉,则轮到kernel6执行的时候则能复用,因此kernel6 不用额外申请多的内存)。 -- **DFS得到的执行序**:kernel1-> kernel2-> kernel3-> kernel4-> kernel5-> kernel6,内存峰值为4G(kernel3执行后可以把kernel1和kernel2的释放掉,则轮到kernel4和kernel5执行的时候则能复用,因此kernel4和kernel5不用额外申请多的内存)。 - -执行序编排是在一定内存限制下求解最优算子并发的复杂性问题,不仅需要识别和利用计算图中的并发机会,以提升计算效率,还必须同时考虑多种限制条件,以确保系统的稳定性和高效性。 - -- 首先,优化模块需要解决求解最优算子并发的复杂性问题。由于计算图中的算子数量庞大且相互依赖,找到一个既能最大化并发又能保持计算图逻辑正确性的执行顺序是一个极具挑战性的任务。 -- 其次,内存限制是执行序优化中不可忽视的关键因素。增大并发虽然可以提升计算效率,但往往会显著增加峰值内存需求,从而可能导致内存溢出(OOM)错误,尤其是在资源受限的环境中。因此,优化模块必须权衡并发与内存使用之间的关系,确保在提升并发的同时,不会超出系统的内存容量。 -- MindSpore的执行序调整模块结合了基于规则和基于启发式策略的方式,提供bfs/dfs两种执行序编排算法[mindspore.jit(option={"exec_order":"bfs/dfs"})](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.jit.html#mindspore.jit),以实现对计算图执行顺序的精细调整,从而在保证计算效率的同时,有效应对内存限制和系统稳定性等多重挑战。 - -## O1模式介绍 - -O1主要定位于在O0基础上实现通用、可泛化的AI编译优化,以支持大部分通用训练、推理场景的更好执行性能需求。 - -在当前阶段,O1主要支持了图算融合优化。其主要思路是:在静态图编译阶段,自动识别计算图中相邻的可融合节点,然后将其融合为更大粒度的可执行算子。通过图算融合,实现增加算子计算局部性、减少整体全局内存访存带宽开销等优化效果。通过对15+网络的实测验证,O1能够实现相比O0平均15%的性能加速。特别是对于访存密集型网络,O1优化效果更加显著。 - -### 图算融合 - -MindSpore等主流AI计算框架对用户提供的算子通常是从用户可理解、易使用角度进行定义。每个算子承载的计算量不等,计算复杂度也各不相同。但从硬件执行角度看,这种天然的、基于用户角度的算子计算量划分,并不高效,也无法充分发挥硬件资源计算能力。主要体现在: - -1. 计算量过大、过复杂的算子,通常很难生成切分较好的高性能算子,从而降低设备利用率; -2. 计算量过小的算子,由于计算无法有效隐藏数据搬移开销,也可能会造成计算的空等时延,从而降低设备利用率; -3. 
硬件Device通常为多核、众核结构,当算子shape较小或其他原因引起计算并行度不够时,可能会造成部分核的空闲,从而降低设备利用率。特别是基于专用处理器架构(Domain Specific Architecture,后文简称DSA)的芯片对这些因素更为敏感。如何最大化发挥硬件算力性能的同时使算子也能具备较好的易用性,一直以来是一个很大的挑战。 - -在AI框架设计方面,目前业界主流采用图层和算子层分层的实现方法。图层负责对计算图进行融合或重组,算子层负责将融合或重组后的算子编译为高性能的可执行算子。图层通常采用基于Tensor的High-Level IR的处理和优化,算子层则采用基于计算指令的Low-Level IR进行分析和优化。 这种人为分层处理显著增加了图、算两层进行协同优化的难度。 - -MindSpore在过去几年的技术实践中,采用了图算融合的技术来较好的解决了这个问题。NLP、推荐等不同类别的典型网络在使能图算融合后训练速度都有明显收益。主要原因之一就是这些网络中存在大量小算子组合,具有较多的融合优化机会。 - -#### 图算融合架构及整体流程 - -图算融合整体架构如下图所示。在图层主要思路是把复合算子打开,然后进行跨边界聚合和优化,最后进行Kernel算子拆分。主要步骤包括: - -1. Composite Expansion:将复合算子展开为基本算子,并构成Composite子图,方便进行后续的跨边界优化和算子拆分; -2. Cross-OP Aggregation:将相邻的基本算子或Composite子图进行聚合,从而构成更大的聚合子图,方便进行后续的跨边界优化和算子拆分; -3. High-Level Optimization:在上面两步得到的聚合子图的基础上,我们可以进行大量的跨边界优化,如代数化简、公共子表达式提取(CSE)等; -4. Kernel Partition:基于计算特征以及融合算子性能,对聚合计算子图进行算子拆分。 - -优化后的计算图会以一个个子图的方式传给MindSpore AKG继续进行后端优化、生成目标代码。 - -![graphkernel](./images/graphkernel.png) - -通过以上步骤,我们可以获得两方面性能收益: - -1. 不同算子之间的跨边界性能优化收益; -2. 通过对整个计算图进行重组拆分,得到最优粒度的融合算子。 - -#### 融合算子加速优化(MindSpore AKG) - -前文提到,在HPC、深度神经网络训练等场景中,图算融合优化可带来成倍的性能提升。但随着图算融合能力的不断增强,融合算子的开发成为了继续提升图算融合能力的瓶颈点。 - -融合算子的自动生成技术可以解决基于DSA开发融合算子编程门槛较高的问题,让程序员在算子开发过程中能够聚焦于算子的实现逻辑,无需关注后端优化,极大提高其开发效率。尤其对于后端硬件架构复杂以及存在复杂算子和融合算子的场景,算子自动生成技术更加关键。 - -因此,**MindSpore AKG基于多面体编译技术(Polyhedral Model),对融合算子的加速优化与自动生成**,能够帮助MindSpore的图算融合模块优化后的融合算子在**异构硬件平台**(GPU/Ascend)上自动生成高性能的kernel,提升MindSpore的训练性能。 - -架构及整体流程如下: - -![graphkernel_akg_overview](./images/graphkernel_akg_overview.png) - -MindSpore AKG的整体框架如上图所示: - -- IR规范化 - - MindSpore AKG的输入为MindSpore图算融合模块优化后的融合子图,通过TVM的Compute / IR Builder / Hybrid 等多种描述方式对子图中的算子进行表达。然后DSL会被转换为 Halide IR([Halide](https://halide-lang.org/),是常见的用于开发高性能图像处理和Array计算的语言,可作为中间表达解耦算法和优化)并进行 IR 规范化; - - 完成初步简化和优化后,Halide IR会被转化为Poly模块所需的调度树; -- Poly模块调度优化 - - 利用Polyhedral技术中的Pluto调度算法,实现循环的自动融合、自动重排等变换,为融合算子自动生成满足并行性、数据局部性的初始调度; - - 为快速适配不同硬件后端,Poly模块内的优化pass会分为硬件无关的通用优化与硬件相关的特定优化,编译时按照硬件特征拼接组合,实现异构硬件后端的快速适配。自动切分、自动映射以及自动内存提升等pass会根据不同硬件的架构性质给出不同的优化方式; -- 后端优化 - - 为了进一步提升算子的性能,我们针对不同硬件后端开发了相应的优化pass,如Ascend后端中实现数据对齐、指令映射,GPU后端中实现向量化存取,插入同步指令等,最终生成相应平台代码。 - -### 其它图优化技术 - -除了图算融合之外,在后续版本中,O1可能会逐步扩展增加一些其它图优化技术。比如: - -1. KernelPacket:用于在动态shape场景对shape计算进行自动融合和优化; -2. 
通算融合:将通信算子与计算算子进行融合。 diff --git a/docs/mindspore/source_zh_cn/features/data_engine.md b/docs/mindspore/source_zh_cn/features/data_engine.md index 19c621bd760a108f2ff0f0de349a5befe46bad78..1940e5cbe4584619ca3d722495b5e6a79b055d9a 100644 --- a/docs/mindspore/source_zh_cn/features/data_engine.md +++ b/docs/mindspore/source_zh_cn/features/data_engine.md @@ -14,7 +14,7 @@ MindSpore训练数据处理引擎核心是将训练样本(数据集)高效 - 提供了自动数据增强模式,能够基于特定策略自动对图像进行数据增强处理; - 提供单节点数据缓存能力,解决重复加载、处理数据的问题,降低数据处理开销,提升端到端训练效率。 -具体用法参考:[数据处理与加载](https://www.mindspore.cn/docs/zh-CN/master/features/dataset/overview.html) +具体用法参考:[数据处理与加载](https://www.mindspore.cn/tutorials/zh-CN/master/dataset/overview.html) ![image](./images/data/data_engine.png) diff --git a/docs/mindspore/source_zh_cn/features/index.rst b/docs/mindspore/source_zh_cn/features/index.rst index 1b046a7b6c3b98702c6c36ec7397119146289c7e..657f6d0bcaa47fdf3e45b42c2334ca6af9140757 100644 --- a/docs/mindspore/source_zh_cn/features/index.rst +++ b/docs/mindspore/source_zh_cn/features/index.rst @@ -17,6 +17,7 @@ Developer Notes runtime/memory_manager runtime/multilevel_pipeline runtime/multistream_concurrency - amp - data_engine mint + view + data_engine + amp diff --git a/docs/mindspore/source_zh_cn/features/overview.md b/docs/mindspore/source_zh_cn/features/overview.md index 414bee3a89fbfac5d54005f46decc2e72876306b..95f47265c61bbed8e6b57a60335eb3856092ec2c 100644 --- a/docs/mindspore/source_zh_cn/features/overview.md +++ b/docs/mindspore/source_zh_cn/features/overview.md @@ -43,19 +43,17 @@ MindSpore实现了函数式微分编程,对可被微分求导的函数对象 同时,基于函数式编程范式,MindSpore提供了丰富高阶函数如vmap、shard等内置高阶函数功能。与微分求导函数grad一样,可以让开发者方便的构造一个函数或对象,作为高阶函数的参数。高阶函数经过内部编译优化,生成针对开发者函数的优化版本,实现如向量化变换、分布式并行切分等特点功能。 -### [动静统一的编程体验](https://www.mindspore.cn/docs/zh-CN/master/features/program_form/overview.html) +### 编程范式(动静结合) -传统AI框架主要有两种编程执行形态,静态图模式和动态图模式。 +传统AI框架主要有两种编程执行形态,静态图模式(graph mode)和动态图模式(pynative mode)。动态图模式又称eager mode。 -静态图模式会基于开发者调用的接口,在编译时生成神经网络的图结构,然后再执行图中涉及的计算操作。 +graph mode会在编译时生成神经网络的模型计算的图结构,然后再执行计算图。 -动态图模式,能有效解决静态图的编程门槛高问题,由于程序是按照代码的编写顺序执行,不做整图编译优化,相对性能优化空间较少,特别是面向DSA等专有硬件的优化具有较大挑战。 +pynative mode,由于程序是按照代码的编写顺序执行,符合python解释执行方式,易开发和调试。因为不做图编译优化,性能优化空间较少,特别是面向DSA等专有硬件的优化具有较大挑战。 -静态图模式,能有效感知神经网络各层算子间的关系,基于编译技术进行有效的编译优化以提升性能。但传统静态图需要开发者感知构图接口,组建或调试网络比较复杂,且难于与常用Python库、自定义Python函数进行穿插使用。 +MindSpore基于Python构建神经网络的图结构,相比于传统的graph mode,能有更易用、更灵活的表达能力。MindSpore创新性的构建源码转换能力,基于Python语句提取AST进行计算图构建,因此可以支持开发者使用的Python原生语法(条件/循环等)和其他操作,如元组(Tuple)、列表(List)以及Lambda表达来构建计算图,并对计算图进行自动微分。所以MindSpore能更好地兼容动态图和静态图的编程接口,在代码层面保持一致,如控制流写法等。 -MindSpore基于Python构建神经网络的图结构,相比于传统的静态图模式,能有更易用、更灵活的表达能力。MindSpore创新性的构建源码转换能力,基于Python语句提取AST进行计算图构建,因此可以支持开发者使用的Python原生语法(条件/循环等)和其他操作,如元组(Tuple)、列表(List)以及Lambda表达来构建计算图,并对计算图进行自动微分。所以MindSpore能更好地兼容动态图和静态图的编程接口,在代码层面保持一致,如控制流写法等。 - -原生Python表达可基于Python控制流关键字,直接使能静态图模式的执行,使得动静态图的编程统一性更高。同时开发者基于MindSpore的接口,可以灵活的对Python代码片段进行动静态图模式控制。即可以将程序局部函数以静态图模式执行([mindspore.jit](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.jit.html))而同时其他函数按照动态图模式执行。从而使得在与常用Python库、自定义Python函数进行穿插执行使用时,开发者可以灵活指定函数片段进行静态图优化加速,而不牺牲穿插执行的编程易用性。 +原生Python表达可基于Python控制流关键字,直接使能静态图模式的执行,使得动静态图的编程统一性更高。同时开发者基于MindSpore的接口,可以灵活的对Python代码片段进行动静态图模式控制。即可以将程序局部函数以静态图模式执行([mindspore.jit](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.jit.html))而其他函数按照动态图模式执行。从而使得在与常用Python库、自定义Python函数进行穿插执行使用时,开发者可以灵活指定函数片段进行静态图优化加速,而不牺牲穿插执行的编程易用性。 ### 分布式并行 @@ -67,11 +65,11 @@ MindSpore在并行化策略搜索中引入了张量重排布技术(Tensor Redi 同时MindSpore还提供了流水线并行、优化器并行、重计算等多种并行策略供开发者使用。 -### 硬件高性能发挥 +### 编译 
MindSpore基于编译技术,提供了丰富的硬件无关优化,如IR融合、代数化简、常数折叠、公共子表达式消除等。同时针对NPU、GPU等不同硬件,也提供各种硬件优化能力,从而更好的发挥硬件的大规模计算加速能力。 -#### [图算融合](https://www.mindspore.cn/docs/zh-CN/master/features/compile/multi_level_compilation.html#图算融合) +#### [多级编译架构](https://www.mindspore.cn/docs/zh-CN/master/features/compile/compilation_guide_zh.html#图算融合) MindSpore等主流AI计算框架对开发者提供的算子通常是从开发中可理解、易使用角度进行定义。每个算子承载的计算量不等,计算复杂度也各不相同。但从硬件执行角度看,这种天然的、基于用开发者角度的算子计算量划分,并不高效,也无法充分发挥硬件资源计算能力。主要体现在: @@ -101,6 +99,6 @@ MindSpore是训推一体的AI框架,同时支持训练和推理等功能。同 MindSpore按照实际执行环境和业务需求,提供多种规格的版本形态,支持部署在云端、服务器端、手机等嵌入式设备端以及耳机等超轻量级设备端上的部署执行。 -### [三方硬件接入](https://www.mindspore.cn/docs/zh-CN/master/features/runtime/pluggable_device.html) +### 三方硬件接入 MindSpore基于统一MindIR构建了开放式AI架构,支持第三方芯片插件化、标准化、低成本快速对接,可接入GPU系列芯片亦可接入各类DSA芯片。MindSpore提供Kernel模式对接和Graph模式对接两种芯片接入方式,芯片产商可根据自身特点进行接入方式选择。 diff --git a/docs/mindspore/source_zh_cn/features/view.md b/docs/mindspore/source_zh_cn/features/view.md index f9d989fcfea88fd244cb3cf071bdfa5702eadf45..535f516fe3b3a4492158b4c0daf2c80e48c9e85f 100644 --- a/docs/mindspore/source_zh_cn/features/view.md +++ b/docs/mindspore/source_zh_cn/features/view.md @@ -1,13 +1,17 @@ ## Tensor View 机制 -View操作是指创建一个新的张量,该张量与原始张量共享相同的数据存储(data storage),但具有不同的形状或排列方式。换句话说,view操作不会复制数据,而是通过不同的视角来持有现有的数据 +[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/mindspore/source_zh_cn/features/view.md) -核心特点 -- 内存共享: View操作创建的新张量与原始张量共享底层数据存储 -- 零拷贝: 不进行数据复制,避免内存分配开销 -- 形状变换: 可以改变张量的形状而不改变数据内容 +View操作是指创建一个新的张量,该张量与原始张量共享相同的数据存储(data storage),但具有不同的形状或排列方式。换句话说,view操作不会复制数据,而是通过不同的视角来持有现有的数据。 + +核心特点: + +- 内存共享:View操作创建的新张量与原始张量共享底层数据存储。 +- 零拷贝:不进行数据复制,避免内存分配开销。 +- 形状变换:可以改变张量的形状而不改变数据内容。 + +Tensor.view()方法: -Tensor.view()方法: ```python import mindspore @@ -27,7 +31,9 @@ z = x.view(3, 2) # 改为3x2张量 print("改后:", z) print("新形状:", z.shape) ``` -Tensor.view_as()方法: + +Tensor.view_as()方法: + ```python import mindspore @@ -41,50 +47,54 @@ print("目标形状:", target.shape) print("结果形状:", y.shape) ``` -需要注意的是: +需要注意的是: -1、view操作要求张量在内存中是连续的,如果不连续,需要先调用contiguous()方法 -```python -import mindspore -from mindspore import ops +1. view操作要求张量在内存中是连续的,如果不连续,需要先调用contiguous()方法 -x = mindspore.tensor([[1, 2, 3], [4, 5, 6]]) -y = mindspore.mint.transpose(x, (1, 0)) # 创建非连续张量 + ```python + import mindspore + from mindspore import ops -# 检查连续性 -print("y是否连续:", y.is_contiguous()) + x = mindspore.tensor([[1, 2, 3], [4, 5, 6]]) + y = mindspore.mint.transpose(x, (1, 0)) # 创建非连续张量 -# 使用contiguous()确保连续性 -z = y.contiguous().view(-1) -print("z是否连续:", z.is_contiguous()) -``` + # 检查连续性 + print("y是否连续:", y.is_contiguous()) -2、view操作要求新形状的元素总数与原始张量相同 -```python -import mindspore + # 使用contiguous()确保连续性 + z = y.contiguous().view(-1) + print("z是否连续:", z.is_contiguous()) + ``` -x = mindspore.tensor([1, 2, 3, 4, 5, 6]) -print("原始张量元素数:", x.numel()) +2. 
view操作要求新形状的元素总数与原始张量相同 -# 正确:6 = 2 * 3 -y = x.view(2, 3) -print("改为2x3:", y) + ```python + import mindspore -# 错误:6 ≠ 2 * 4 -try: - z = x.view(2, 4) -except RuntimeError as e: - print("形状不匹配错误:", e) -``` + x = mindspore.tensor([1, 2, 3, 4, 5, 6]) + print("原始张量元素数:", x.numel()) + + # 正确:6 = 2 * 3 + y = x.view(2, 3) + print("改为2x3:", y) + + # 错误:6 ≠ 2 * 4 + try: + z = x.view(2, 4) + except RuntimeError as e: + print("形状不匹配错误:", e) + ``` + +view与reshape区别: + +- view操作: -view与reshape区别: + - 严格要求连续性: View操作要求张量在内存中必须是连续的。 + - 失败机制: 如果张量不连续,view操作会抛出错误。 + - 解决方案: 需要先调用contiguous()方法。 -view操作 - - 严格要求连续性: View操作要求张量在内存中必须是连续的 - - 失败机制: 如果张量不连续,view操作会抛出错误 - - 解决方案: 需要先调用contiguous()方法 +- reshape操作: -reshape操作 - - 灵活处理: reshape操作更灵活,不要求张量必须连续 - - 自动处理: 如果张量不连续,reshape会自动创建新拷贝 - - 始终成功: 只要形状匹配,reshape操作总是能成功 \ No newline at end of file + - 灵活处理: reshape操作更灵活,不要求张量必须连续。 + - 自动处理: 如果张量不连续,reshape会自动创建新拷贝。 + - 始终成功: 只要形状匹配,reshape操作总是能成功。 \ No newline at end of file diff --git a/docs/mindspore/source_zh_cn/note/api_mapping/pytorch_api_mapping.md b/docs/mindspore/source_zh_cn/note/api_mapping/pytorch_api_mapping.md index 2af08cd1efa96e52e1183d3f9befd3676802c175..c800555280eb147bb2a7ec00da75dfaff4b31d96 100644 --- a/docs/mindspore/source_zh_cn/note/api_mapping/pytorch_api_mapping.md +++ b/docs/mindspore/source_zh_cn/note/api_mapping/pytorch_api_mapping.md @@ -292,9 +292,9 @@ mindspore.mint.argmax只有一种API形式,即mindspore.mint.argmax(input, dim | [torch.nn.AvgPool2d](https://PyTorch.org/docs/2.1/generated/torch.nn.AvgPool2d.html) | [mindspore.mint.nn.AvgPool2d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.AvgPool2d.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | | [torch.nn.BCELoss](https://PyTorch.org/docs/2.1/generated/torch.nn.BCELoss.html) | [mindspore.mint.nn.BCELoss](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.BCELoss.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | | [torch.nn.BCEWithLogitsLoss](https://pytorch.org/docs/2.1/generated/torch.nn.BCEWithLogitsLoss.html) | [mindspore.mint.nn.BCEWithLogitsLoss](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.BCEWithLogitsLoss.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | -| [torch.nn.BatchNorm1d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm1d.html) | [mindspore.mint.nn.BatchNorm1d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.BatchNorm1d.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景)| -| [torch.nn.BatchNorm2d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm2d.html) | [mindspore.mint.nn.BatchNorm2d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.BatchNorm2d.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景)| -| [torch.nn.BatchNorm3d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm3d.html) | [mindspore.mint.nn.BatchNorm3d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.BatchNorm3d.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景)| +| [torch.nn.BatchNorm1d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm1d.html) | 
[mindspore.mint.nn.BatchNorm1d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.BatchNorm1d.html) | 功能一致,MindSpore默认为推理模式 | +| [torch.nn.BatchNorm2d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm2d.html) | [mindspore.mint.nn.BatchNorm2d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.BatchNorm2d.html) | 功能一致,MindSpore默认为推理模式 | +| [torch.nn.BatchNorm3d](https://PyTorch.org/docs/2.1/generated/torch.nn.BatchNorm3d.html) | [mindspore.mint.nn.BatchNorm3d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.BatchNorm3d.html) | 功能一致,MindSpore默认为推理模式 | | [torch.nn.ConstantPad1d](https://pytorch.org/docs/2.1/generated/torch.nn.ConstantPad1d.html) | [mindspore.mint.nn.ConstantPad1d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.ConstantPad1d.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | | [torch.nn.ConstantPad2d](https://pytorch.org/docs/2.1/generated/torch.nn.ConstantPad2d.html) | [mindspore.mint.nn.ConstantPad2d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.ConstantPad2d.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | | [torch.nn.ConstantPad3d](https://pytorch.org/docs/2.1/generated/torch.nn.ConstantPad3d.html) | [mindspore.mint.nn.ConstantPad3d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.ConstantPad3d.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | @@ -303,7 +303,7 @@ mindspore.mint.argmax只有一种API形式,即mindspore.mint.argmax(input, dim | [torch.nn.ConvTranspose2d](https://pytorch.org/docs/2.1/generated/torch.nn.ConvTranspose2d.html) | [mindspore.mint.nn.ConvTranspose2d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.ConvTranspose2d.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | | [torch.nn.CrossEntropyLoss](https://pytorch.org/docs/2.1/generated/torch.nn.CrossEntropyLoss.html) | [mindspore.mint.nn.CrossEntropyLoss](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.CrossEntropyLoss.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | | [torch.nn.Dropout](https://pytorch.org/docs/2.1/generated/torch.nn.Dropout.html) | [mindspore.mint.nn.Dropout](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.Dropout.html) | 功能一致,MindSpore默认为推理模式 | -| [torch.nn.Dropout2d](https://pytorch.org/docs/2.1/generated/torch.nn.Dropout2d.html) | [mindspore.mint.nn.Dropout2d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.Dropout2d.html) | 功能一致,MindSpore不含参数inplace,MindSpore默认为推理模式 | +| [torch.nn.Dropout2d](https://pytorch.org/docs/2.1/generated/torch.nn.Dropout2d.html) | [mindspore.mint.nn.Dropout2d](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.Dropout2d.html) | 功能一致,MindSpore默认为推理模式 | | [torch.nn.ELU](https://pytorch.org/docs/2.1/generated/torch.nn.ELU.html) | [mindspore.mint.nn.ELU](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.ELU.html) | 功能一致,MindSpore不含参数inplace| | [torch.nn.Embedding](https://pytorch.org/docs/2.1/generated/torch.nn.Embedding.html) | 
[mindspore.mint.nn.Embedding](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.Embedding.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | | [torch.nn.Flatten](https://pytorch.org/docs/2.1/generated/torch.nn.Flatten.html) | [mindspore.mint.flatten](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.flatten.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | @@ -337,7 +337,7 @@ mindspore.mint.argmax只有一种API形式,即mindspore.mint.argmax(input, dim | [torch.nn.SmoothL1Loss](https://pytorch.org/docs/2.1/generated/torch.nn.SmoothL1Loss.html) | [mindspore.mint.nn.SmoothL1Loss](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.SmoothL1Loss.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | | [torch.nn.Softmax](https://pytorch.org/docs/2.1/generated/torch.nn.Softmax.html) | [mindspore.mint.nn.Softmax](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.Softmax.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景)| | [torch.nn.Softshrink](https://pytorch.org/docs/2.1/generated/torch.nn.Softshrink.html) | [mindspore.mint.nn.Softshrink](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.Softshrink.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景)| -| [torch.nn.SyncBatchNorm](https://pytorch.org/docs/2.1/generated/torch.nn.SyncBatchNorm.html) | [mindspore.mint.nn.SyncBatchNorm](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.SyncBatchNorm.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | +| [torch.nn.SyncBatchNorm](https://pytorch.org/docs/2.1/generated/torch.nn.SyncBatchNorm.html) | [mindspore.mint.nn.SyncBatchNorm](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.SyncBatchNorm.html) | 功能一致,MindSpore默认为推理模式 | | [torch.nn.Tanh](https://pytorch.org/docs/2.1/generated/torch.nn.Tanh.html) | [mindspore.mint.nn.Tanh](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.Tanh.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | | [torch.nn.Unfold](https://pytorch.org/docs/2.1/generated/torch.nn.Unfold.html) | [mindspore.mint.nn.Unfold](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.Unfold.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景)| | [torch.nn.Upsample](https://pytorch.org/docs/2.1/generated/torch.nn.Upsample.html) | [mindspore.mint.nn.Upsample](https://www.mindspore.cn/docs/zh-CN/master/api_python/mint/mindspore.mint.nn.Upsample.html) | [一致](https://www.mindspore.cn/docs/zh-CN/master/note/api_mapping/pytorch_api_mapping.html#api映射一致标准及例外场景) | diff --git a/docs/mindstudio/docs/source_zh_cn/version/mindstudio_insight.md b/docs/mindstudio/docs/source_zh_cn/version/mindstudio_insight.md index aa064659f09934c08d3aa91068597efb81ef6204..4f0c94cf478b44a06c1d7a70db02749611551348 100644 --- a/docs/mindstudio/docs/source_zh_cn/version/mindstudio_insight.md +++ b/docs/mindstudio/docs/source_zh_cn/version/mindstudio_insight.md @@ -13,6 +13,6 @@ MindStudio 
Insight可视化工具,需要与采集性能数据时使用的MindS | MindStudio Insight | MindSpore | |:----------------------------------------------------:|:-------------------------------------------:| -| [8.0.RC1](https://www.hiascend.com/developer/download/community/result?module=sto+cann&sto=8.0.RC1&cann=8.1.RC1.beta1) | [2.6.0](https://www.mindspore.cn/versions#2.6.0) | +| [8.0.RC1](https://www.hiascend.com/developer/download/community/result?module=sto+cann&sto=8.0.RC1&cann=8.1.RC1) | [2.6.0](https://www.mindspore.cn/versions#2.6.0) | | [7.0.RC3](https://www.hiascend.com/developer/download/community/result?module=sto+cann&sto=7.0.RC3&cann=8.0.RC3.beta1) | [2.5.0](https://www.mindspore.cn/versions#2.5.0) | | [7.0.RC3](https://www.hiascend.com/developer/download/community/result?module=sto+cann&sto=7.0.RC3&cann=8.0.RC3.beta1) | [2.4.10](https://www.mindspore.cn/versions#2.4.10) | \ No newline at end of file diff --git a/resource/release/release_list_en.md b/resource/release/release_list_en.md index 66701ae9d5cd75fcf2830a160b4899da9c11afcf..48dc66a0f7ac6dd4177dad854e6a7c08adbc5fee 100644 --- a/resource/release/release_list_en.md +++ b/resource/release/release_list_en.md @@ -110,7 +110,7 @@ | Commercial edition Installation Guide | Community edition download link (refer to commercial edition for instructions) | |--------|------------------| -| [Ascend Training Solution 25.0.RC1](https://support.huawei.com/enterprise/zh/doc/EDOC1100472026) | [CANN 8.1.RC1.beta1](https://www.hiascend.com/developer/download/community/result?module=cann)
[firmware and driver](https://www.hiascend.com/hardware/firmware-drivers/community) | +| [Ascend Training Solution 25.0.RC1](https://support.huawei.com/enterprise/zh/doc/EDOC1100472026) | [CANN 8.1.RC1](https://www.hiascend.com/developer/download/community/result?module=cann)
[firmware and driver](https://www.hiascend.com/hardware/firmware-drivers/community) | **Related Documents** @@ -144,7 +144,7 @@ | Commercial edition Installation Guide | Community edition download link (refer to commercial edition for instructions) | |--------|------------------| -| [Ascend Training Solution 25.0.RC1](https://support.huawei.com/enterprise/zh/doc/EDOC1100472026) | [CANN 8.1.RC1.beta1](https://www.hiascend.com/developer/download/community/result?module=cann)
[firmware and driver](https://www.hiascend.com/hardware/firmware-drivers/community) | +| [Ascend Training Solution 25.0.RC1](https://support.huawei.com/enterprise/zh/doc/EDOC1100472026) | [CANN 8.1.RC1](https://www.hiascend.com/developer/download/community/result?module=cann)
[firmware and driver](https://www.hiascend.com/hardware/firmware-drivers/community) | **Related Documents** diff --git a/resource/release/release_list_zh_cn.md b/resource/release/release_list_zh_cn.md index 3c6c123301731b67d6fa2ec1fb75bdda8baf873d..94b37b9abb8541e2b50b599abfc02e0219887799 100644 --- a/resource/release/release_list_zh_cn.md +++ b/resource/release/release_list_zh_cn.md @@ -112,7 +112,7 @@ | 商用版安装指引文档 | 社区版下载地址(安装参考商用版) | |--------|------------------| -| [Ascend Training Solution 25.0.RC1](https://support.huawei.com/enterprise/zh/doc/EDOC1100472026) | [CANN 8.1.RC1.beta1](https://www.hiascend.com/developer/download/community/result?module=cann)
[固件与驱动](https://www.hiascend.com/hardware/firmware-drivers/community) | +| [Ascend Training Solution 25.0.RC1](https://support.huawei.com/enterprise/zh/doc/EDOC1100472026) | [CANN 8.1.RC1](https://www.hiascend.com/developer/download/community/result?module=cann)
[固件与驱动](https://www.hiascend.com/hardware/firmware-drivers/community) | **配套资料** @@ -146,7 +146,7 @@ | 商用版安装指引文档 | 社区版下载地址(安装参考商用版) | |--------|------------------| -| [Ascend Training Solution 25.0.RC1](https://support.huawei.com/enterprise/zh/doc/EDOC1100472026) | [CANN 8.1.RC1.beta1](https://www.hiascend.com/developer/download/community/result?module=cann)
[固件与驱动](https://www.hiascend.com/hardware/firmware-drivers/community) | +| [Ascend Training Solution 25.0.RC1](https://support.huawei.com/enterprise/zh/doc/EDOC1100472026) | [CANN 8.1.RC1](https://www.hiascend.com/developer/download/community/result?module=cann)
[固件与驱动](https://www.hiascend.com/hardware/firmware-drivers/community) | **配套资料** diff --git a/tutorials/source_en/beginner/tensor.md b/tutorials/source_en/beginner/tensor.md index eebcc25910c5391d04faa8a0b9568b98e011be1d..f3ce742a49dc86b71fdecd409d88b0700df1e9f1 100644 --- a/tutorials/source_en/beginner/tensor.md +++ b/tutorials/source_en/beginner/tensor.md @@ -11,7 +11,6 @@ A tensor is a special data structure that is similar to arrays and matrices. [Te ```python import numpy as np import mindspore -from mindspore import ops from mindspore import Tensor ``` @@ -25,7 +24,7 @@ There are multiple methods for creating tensors. When building a tensor, you can ```python data = [1, 0, 1, 0] - x_data = Tensor(data) + x_data = mindspore.tensor(data) print(x_data, x_data.shape, x_data.dtype) ``` @@ -39,12 +38,12 @@ There are multiple methods for creating tensors. When building a tensor, you can ```python np_array = np.array(data) - x_np = Tensor(np_array) - print(x_np, x_np.shape, x_np.dtype) + x_from_np = mindspore.tensor(np_array) + print(x_from_np, x_from_np.shape) ``` ```text - [1 0 1 0] (4,) Int64 + [1 0 1 0] (4,) ``` - **Generating a tensor by using init** @@ -59,9 +58,9 @@ There are multiple methods for creating tensors. When building a tensor, you can from mindspore.common.initializer import One, Normal # Initialize a tensor with ones - tensor1 = mindspore.Tensor(shape=(2, 2), dtype=mindspore.float32, init=One()) + tensor1 = mindspore.tensor(shape=(2, 2), dtype=mindspore.float32, init=One()) # Initialize a tensor from normal distribution - tensor2 = mindspore.Tensor(shape=(2, 2), dtype=mindspore.float32, init=Normal()) + tensor2 = mindspore.tensor(shape=(2, 2), dtype=mindspore.float32, init=Normal()) print("tensor1:\n", tensor1) print("tensor2:\n", tensor2) @@ -72,8 +71,8 @@ There are multiple methods for creating tensors. When building a tensor, you can [[1. 1.] [1. 1.]] tensor2: - [[-0.00063482 -0.00916224] - [ 0.01324238 -0.0171206 ]] + [[-0.0107513 0.00407822] + [-0.00113699 0.00081491]] ``` The `init` is used for delayed initialization in parallel mode. Usually, it is not recommended to use `init` interface to initialize parameters. @@ -117,7 +116,7 @@ Tensor attributes include shape, data type, transposed tensor, item size, number - strides: the number of bytes to traverse in each dimension of `Tensor`, which is a tuple. ```python -x = Tensor(np.array([[1, 2], [3, 4]]), mindspore.int32) +x = mindspore.tensor(np.array([[1, 2], [3, 4]]), mindspore.int32) print("x_shape:", x.shape) print("x_dtype:", x.dtype) @@ -143,7 +142,7 @@ x_strides: (8, 4) Tensor indexing is similar to NumPy indexing. Indexing starts from 0, negative indexing means indexing in reverse order, and colons `:` and `...` are used for slicing. ```python -tensor = Tensor(np.array([[0, 1], [2, 3]]).astype(np.float32)) +tensor = mindspore.tensor(np.array([[0, 1], [2, 3]]).astype(np.float32)) print("First row: {}".format(tensor[0])) print("value of bottom right corner: {}".format(tensor[1, 1])) @@ -165,8 +164,8 @@ There are many operations between tensors, including arithmetic, linear algebra, > Common arithmetic operations include: addition (+), subtraction (-), multiplication (\*), division (/), modulo (%), and exact division (//). 
```python -x = Tensor(np.array([1, 2, 3]), mindspore.float32) -y = Tensor(np.array([4, 5, 6]), mindspore.float32) +x = mindspore.tensor(np.array([1, 2, 3]), mindspore.float32) +y = mindspore.tensor(np.array([4, 5, 6]), mindspore.float32) output_add = x + y output_sub = x - y @@ -195,8 +194,8 @@ floordiv: [4. 2. 2.] [concat](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.concat.html) connects a series of tensors in a given dimension. ```python -data1 = Tensor(np.array([[0, 1], [2, 3]]).astype(np.float32)) -data2 = Tensor(np.array([[4, 5], [6, 7]]).astype(np.float32)) +data1 = mindspore.tensor(np.array([[0, 1], [2, 3]]).astype(np.float32)) +data2 = mindspore.tensor(np.array([[4, 5], [6, 7]]).astype(np.float32)) output = ops.concat((data1, data2), axis=0) print(output) @@ -215,8 +214,8 @@ shape: [stack](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.stack.html) combines two tensors from another dimension. ```python -data1 = Tensor(np.array([[0, 1], [2, 3]]).astype(np.float32)) -data2 = Tensor(np.array([[4, 5], [6, 7]]).astype(np.float32)) +data1 = mindspore.tensor(np.array([[0, 1], [2, 3]]).astype(np.float32)) +data2 = mindspore.tensor(np.array([[4, 5], [6, 7]]).astype(np.float32)) output = ops.stack([data1, data2]) print(output) @@ -242,7 +241,7 @@ Tensor and NumPy can be converted to each other. Use [Tensor.asnumpy()](https://www.mindspore.cn/docs/en/master/api_python/mindspore/Tensor/mindspore.Tensor.asnumpy.html) to convert Tensor to NumPy, which is same as tensor building. ```python -t = Tensor([1., 1., 1., 1., 1.]) +t = mindspore.tensor([1., 1., 1., 1., 1.]) print(f"t: {t}", type(t)) n = t.asnumpy() print(f"n: {n}", type(n)) @@ -255,7 +254,7 @@ n: [1. 1. 1. 1. 1.] ### NumPy to Tensor -Use `Tensor()` to convert NumPy to Tensor. +Use [Tensor.from_numpy()](https://www.mindspore.cn/docs/en/master/api_python/mindspore/Tensor/mindspore.Tensor.asnumpy.html) to convert NumPy to Tensor, which operates via memory sharing (zero-copy) for better performance, with the constraint that input NumPy arrays must be memory-contiguous (verifiable with numpy.iscontiguous()). ```python n = np.ones(5) @@ -271,4 +270,22 @@ print(f"t: {t}", type(t)) ```text n: [2. 2. 2. 2. 2.] t: [2. 2. 2. 2. 2.] +``` + +Use [mindspore.tensor()](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.tensor.html) for direct creation results in data copying. + +```python +n = np.ones(5) +t = mindspore.tensor(n) +``` + +```python +np.add(n, 1, out=n) +print(f"n: {n}", type(n)) +print(f"t: {t}", type(t)) +``` + +```text +n: [2. 2. 2. 2. 2.] +t: [1. 1. 1. 1. 1.] ``` \ No newline at end of file diff --git a/tutorials/source_en/debug/profiler.md b/tutorials/source_en/debug/profiler.md index 0d1948fb090a1fafd55a37484213465785571d63..47013bf20bf52e7f53da10a2093ea5a2304f18f2 100644 --- a/tutorials/source_en/debug/profiler.md +++ b/tutorials/source_en/debug/profiler.md @@ -22,7 +22,7 @@ There are five ways to collect training performance data, and the following desc ### Method 1: mindspore.Profiler Interface Enabling -Add the MindSpore Profiler related interfaces in the training script, users can refer to [MindSpore Profiler parameter details](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.Profiler.html) and [_ExperimentalConfig Parameter Details](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.profiler._ExperimentalConfig.html) to configure parameters such as profiler_level according to their data requirements. 
+Add the MindSpore Profiler related interfaces in the training script, users can refer to [MindSpore Profiler parameter details](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.Profiler.html) and [_ExperimentalConfig Parameter Details](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.profiler._ExperimentalConfig.html) to configure parameters such as profiler_level according to their data requirements. The interface supports two collection modes: CallBack mode and custom for loop mode, and supports both Graph and PyNative modes. diff --git a/tutorials/source_en/model_infer/ms_infer/images/embedding2.png b/tutorials/source_en/model_infer/ms_infer/images/embedding2.png new file mode 100644 index 0000000000000000000000000000000000000000..aa172f4311f5dd8eb64a51f7501258649d1b9717 Binary files /dev/null and b/tutorials/source_en/model_infer/ms_infer/images/embedding2.png differ diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst new file mode 100644 index 0000000000000000000000000000000000000000..bb0f7e38890642b609a8786a6e5a1188ed4be7c8 --- /dev/null +++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst @@ -0,0 +1,437 @@ +MindSpore LLM Inference with Framework +========================================== + +.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg + :target: https://gitee.com/mindspore/docs/blob/master/tutorials/source_en/parallel/optimize_technique.rst + :alt: View Source On Gitee + +.. toctree:: + :maxdepth: 1 + :hidden: + + ms_infer_network_develop + ms_infer_parallel_infer + ms_infer_quantization + ms_infer_model_serving_infer + +Background +------------ + +At the end of 2022, with the release of OpenAI's ChatGPT, a new research direction emerged in the AI domain, that is, LLMs based on the Transformers structure. These LLMs exhibited capabilities beyond expectations and achieved impressive results in various tests, quickly becoming the research focus of AI. + +One significant research direction in LLMs is improving their cost-effectiveness in practical applications. + +- An LLM usually has tens of billions of parameters. In this case, the computation workload for a single model inference process is extremely high and requires massive compute resources. As a result, AI service providers find that the cost of an LLM inference is very high and cannot be effectively applied to real-world scenarios. + +- To address the high costs of LLM inference, the MindSpore framework offers inference capabilities. Based on the characteristics of mainstream LLMs, MindSpore has deeply optimized the LLM deployment and inference processes, achieving optimal cost efficiency in model inference. + +Model Principles +------------------------ + +Before learning about the inference capability of MindSpore, first explore how current mainstream LLMs achieve such amazing intelligence. We will take the most common text generation models as examples to briefly introduce the inference principles of LLMs, and see how AI models perform complex tasks such as conversation and summarizing main ideas through computation. + +Similar to a common model, the construction of an LLM consists of two phases: training and inference. + +- **Training**: The training process of an LLM can be simply understood as that a model continuously reading and learning from massive text data. 
During this process, the model records the position relationship and occurrence frequency of each text element in the model weight. For example, there is a high probability that "9.6 million square kilometers" will appear after the sentence "China has an area of". During the training process, the LLM records that the two sentences are strongly associated through massive data input. + +- **Inference**: The LLM inference process is to find the most relevant subsequent text elements from the training database based on a specific piece of text provided. For example, if you ask "China has an area of", the LLM can return "9.6 million square kilometers" based on the information recorded during training, providing you with your desired answer. + +In actual text processing scenarios, languages are complex and changing. Therefore, it is difficult to identify the direct correlation between two sentences. LLM technologies usually use the tokenization method, that is, breaking down "China has an area of" into multiple common words such as "China", "has", "an", "area", and "of". This method can better cope with the impact of text differences. For example, the similarity between the phrases "the area of China is" and "China has an area of" is nearly 0, while the similarity between ["the", "area", "of", "China", "is"] and ["China", "has", "an", "area", "of"] can be considered as 60%, which can effectively helps the LLM identify such text differences. This technique, known as tokenization, breaks a piece of text into a combination of tokens (usually words and punctuation). The process of generating a sentence is as follows: The LLM infers the next token based on the current token combination, combines the next token with the previous tokens to form a new input, and gradually completes the generation of the entire text through repeated training step. The following table briefly describes an example of LLM inference. + +Input: Capital of China + +.. list-table:: Inference example + :header-rows: 1 + + * - Inference iteration + - Inference input + - Input vector + - Inference result + * - 1 + - China's capital + - [China, 's, capital] + - Beijing + * - 2 + - China's capital, Beijing + - [China, 's, capital, Beijing] + - is + * - 3 + - China's capital, Beijing, is + - [China, 's, capital, Beijing, is] + - Beautiful + * - 4 + - China's capital, Beijing, is beautiful. + - [China, 's, capital, Beijing, is, beautiful] + - END + +In each step of training, the LLM infers the next token based on the current context and combines the token with the previous statement to form the input of the next step of training. After multiple steps of training, if the special token "END" is generated, the model considers that the inference ends, and returns the result. + +Procedure +---------------- + +MindSpore LLM inference provides you with an "out-of-the-box" deployment and inference capability. You can use the LLM APIs provided by MindSpore to quickly deploy your own LLMs and optimize them based on model features, achieving the optimal cost-effectiveness and bringing LLM capabilities to practical applications. The following figure shows the key steps of model inference using the MindSpore LLM inference feature. + +.. figure:: ./images/llm_infer_flow.png + :alt: llm-infer-flow + +1. **Weight preparation**: The weight data is the intelligent core of an LLM, and therefore the first step of deploying a model is to obtain and prepare the corresponding weight files. +2. 
**Model loading**: During inference, the model structure may differ based on the optimization techniques used. Therefore, the backbone network of the model needs to be constructed based on the model network structure to facilitate subsequent inference. +3. **Status determination**: Based on the specific semantics of the inference request, the model determines whether to continue with inference. This process is mainly used to determine whether to end multi-step inference. If inference ends (for example, after answering a question), the results are returned; otherwise, the next step of inference continues. +4. **Inference preprocessing**: The inference data is preprocessed according to the inference request. Common preprocessing steps include using a tokenizer to convert the statement into a group of digital vectors represented by indexes, allowing the LLM to accurately recognize the task content, and constructing some special input of model inference for acceleration (for example, cache information of incremental inference of KVCache). +5. **Model inference**: The model performs inference based on the input data, typically returning the probability distribution of the next token in the sentence. +6. **Inference postprocessing**: Based on the results of the model inference, the next token is computed and converted back into text. If inference does not end, the token is assembled into the input for the next step of inference to continue the process. + +Main Features +---------------- + +To achieve the optimal cost-effectiveness, MindSpore LLM has undergone multiple in-depth optimizations tailored to the characteristics of LLM networks. The main features include: + +- **Full and incremental inference**: The core network structure of LLMs primarily utilizes a transformer-based self-attention mechanism, where attention scores of all tokens are computed in each training step. However, the attention scores of the same token sequence yield the same key and value (KV) results. For example, the KV of ["the", "area", "of", "China", "is"] may be understood as a combination of ["the", "area", "of", "China"] and ["is"]. Therefore, by caching the keys and values of previously computed sequences, the computation workload for the next training step can be reduced. This technique is commonly known as KVCache optimization. In two consecutive training steps, *N* and *N* +1, the KVs from training step *N* can be fully reused in training step *N* +1 because the first *N* sequences are identical and only the first token of *N* +1 steps needs to be computed. In this way, the model inference can be divided into the following two phases: + + - **Full inference**: This is the first training step initiated by your input, where the length *N* of the input statement and the content is unpredictable. All keys and values must be computed, which is called a full inference. + + - **Incremental inference**: After completing the first training step, the keys and values from the previous statement are stored in the KVCache. In this case, only the KV corresponding to the latest token need to be computed, which are then combined with the cached result to compute the attention score, constituting an incremental inference. + +- **Attention optimization**: The primary computation in the LLM's network involves the computation of attention. 
Since the attention size in mainstream models is often large (typically 4096 x 4096 or more), the performance of the entire inference process heavily relies on the efficiency of attention computation. Many studies focus on optimizing the performance of attention computation, with notable techniques such as flash attention and page attention. + + - **Flash attention**: During attention computation, two large matrices (4096 x 4096) are multiplied. This computation breaks the large matrix into smaller matrices that can be processed on multiple chips. Subject to the minimum cache size of chips, data must continuously be moved between the cache and main memory. As a result, compute resources cannot be fully used. Consequently, attention computation is often bandwidth-bound. Flash attention addresses this by dividing attention into blocks, allowing each block to be computed independently on a chip, avoiding multiple data movements during the computation of KVs and enhancing attention computation performance. For details, see `Flash Attention `_. + + - **Page attention graphics memory optimization**: Standard flash attention reads and saves the entire input KV data each time. This method is simple but wastes many resources. For example, "China's capital" and "China's national flag" share "China's", leading to identical KVs for their attention. Standard flash attention needs to store two copies of KVs, wasting the graphics memory. Page attention optimizes KVCache based on the page table principle of the Linux OS. It stores KVs in blocks of a specific size. In the preceding example, "China", "'s", "capital", and "national flag" are stored as four pieces of KV data. Compared with the original six pieces of data, this method effectively saves graphics memory resources. In the service-oriented scenario, more idle graphics memory allows for a larger batch size for model inference, thereby achieving higher throughput. For details, see `Page Attention `_. + +- **Model quantization**: MindSpore LLM inference supports quantization to reduce the model size. It provides technologies such as A16W8, A16W4, A8W8, and KVCache quantizations to reduce model resource usage and improve the inference throughput. + +Inference Tutorial +------------------------ + +Based on the mainstream Qwen2 open-source LLM, this section demonstrates how to use the inference capability of the MindSpore model to build an example of end-to-end text generation. + +.. note:: + + The Qwen2 model has multiple versions and configurations. This document uses Qwen2-7B-Instruct as an example. + +Environment Preparations +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +MindSpore LLM inference with the framework mainly depends on the MindSpore open-source software. Before using the framework, you need to install the MindSpore Python package. You are advised to use the conda virtual environment. You can run the following commands for installation: + +.. code:: shell + + export PYTHON_ENV_NAME=mindspore-infer-py311 + conda create -n ${PYTHON_ENV_NAME} python=3.11 + conda activate ${PYTHON_ENV_NAME} + pip install mindspore + +You can also install the Python package adapted to your environment by referring to the official installation document. For details, see `MindSpore Installation `_. + +MindSpore inference mainly runs on the Ascend AI Processor environment. You need to install the corresponding Ascend development environment. For details, see the following: + +.. 
code:: shell + + pip install ${ASCEND_HOME}/lib64/te-*.whl + pip install ${ASCEND_HOME}/lib64/hccl-*.whl + pip install sympy + +If you need to reuse the tokenizer capability of the mainstream LLM, you can install the Transformers software package. + +.. code:: shell + + pip install transformers + +If you need to use model quantization to enhance inference performance, you need to install the mindspore_gs package. For details, see `Installing MindSpore Golden Stick `_. + +Weight Preparation +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Obtain the weight file of the LLM for weight preparation. In addition, each LLM usually has its own token list, which indicates a full set of words supported by the model. Therefore, you need to obtain the tokenizer mapping in addition to the model weight. MindSpore supports the direct loading of the safetensor weight file. You can directly download the model weight file from the Hugging Face official website. + +For the Qwen2 LLM, you are advised to use the pre-trained weight files and tokenizer mapping provided on the Hugging Face official website. You can run the following commands to download weights: + +.. code:: shell + + git lfs install + git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct + +After the download is complete, the following file tree structure should be displayed in the related directory: + +.. code:: shell + + ls + |- config.json + |- LICENSE + |- merges.txt + |- model-00001-of-00004.safetensors + |- model-00002-of-00004.safetensors + |- model-00003-of-00004.safetensors + |- model-00004-of-00004.safetensors + |- model.safetensors.index.json + |- README.md + |- tokenizer_config.json + |- tokenizer.json + |- vocab.json + +Model Building +~~~~~~~~~~~~~~~~~~~~ + +You need to build a model and load the weight by running the following codes first: + +.. code:: python + + import os + import mindspore as ms + from qwen2 import Qwen2Config, Qwen2ForCausalLM, CacheManager + from mindspore import Tensor, mint + + # set mindspore context and envs + os.environ["MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST"] = "PagedAttention" + + ms.set_context(infer_boost="on") + ms.set_context(mode=ms.context.PYNATIVE_MODE) + + model_path = "/path/to/model" + input_str = ["I love Beijing, because", "Hello, Qwen2"] + batch_size = len(input_str) + max_new_tokens = 64 + block_size = 128 + max_seq_lens = block_size * 10 + block_num = (max_seq_lens * batch_size) // block_size + + config = Qwen2Config.from_json(model_path + "/config.json") + + model = Qwen2ForCausalLM(config) + # load weight + model.load_weight(model_path) + + cache_manager = CacheManager(config, block_num, block_size, batch_size) + +Qwen2 is the network script (qwen2.py) of the model, which must be in the same directory as the current script. For details, see `Building an LLM Inference Network from Scratch <./ms_infer_network_develop.md>`_. You can also use other network scripts, but you need to modify the corresponding model APIs. + +The first step in the script is to set MindSpore environment variables, including: + +- **MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST**: sets the TH flattening operator supported by MindSpore for PagedAttention. MindSpore only supports the TH format in dynamic graph mode. Therefore, if you want to develop in dynamic graph mode, you need to set this environment variable. You can also use the BSH format. + +- **infer_boost**: enables inference optimization. This optimization is mainly to enable MindSpore fusion operators such as FlashAttention and PagedAttention. 
+ +- **mode**: sets the execution mode to dynamic graph mode. This mode is more convenient for debugging and development. You are advised to use this mode during model development. + +The second step in the script is to initialize the model and KVCache using the class provided by the model script **qwen2.py**. The following parameters are included: + +- **input_str**: specifies the original text to be inferred. A string list with **batch_size** set to **2** is passed at a time, indicating that two statements are inferred at the same time. + +- **model_path**: specifies the model directory path, that is, the path of the model downloaded from the Hugging Face official website. + +- **max_new_tokens**: specifies the maximum number of inference words. When the number of inference words reaches the maximum, the inference stops and is used in subsequent iterations. + +- **block_size**: specifies the block size of the KVCache object managed by PagedAttention. A smaller value of **block_size** indicates finer division and higher reuse probability of different requests. A larger value of **block_size** indicates that more valid data is read at a time during network computing, and the computing performance is better. + +- **max_seq_len**: specifies the maximum length supported by model inference. This parameter can be obtained from **config** and affects the graphics memory usage of KVCache. The Qwen2 configuration is large (32,000) by default. Therefore, this parameter is set to 10 times the value of **block_size** for simplification. + +Initialize the model based on the preceding parameters to obtain the model and cache_manager objects. + +Model Inference +~~~~~~~~~~~~~~~~~~~~ + +Once the model is built, you can utilize the model object for text generation, enabling applications such as self-service customer support, intelligent Q&A, and chatbots. However, the input of an application is usually a language text, which cannot be directly used as the input of the model for computation. Therefore, we need to add the preprocessing and postprocessing logic to convert the text language into token data that can be identified by the model. After the inference computation is complete, the token data is converted into the text language. The following uses a simple Q&A text generation as an example to describe the process. + +- **Preprocessing**: Use the tokenizer's data to break a sentence down into a list represented by multiple token IDs. In this case, the tokenizer of the open-source community Transformers is used. + + .. code:: python + + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + input_str = ["I love Beijing, because", "Hello, Qwen2"] + + input_ids = tokenizer(input_str)["input_ids"] + + print(input_ids) + + After the Python code is executed, the following information is displayed: + + .. code:: shell + + [[40, 2948, 26549, 11, 1576], [9707, 11, 1207, 16948, 17]] + + [40, 2948, 26549, 11, 1576] corresponds to the word sequence "I love Beijing, because". **40** indicates the token corresponding to "I", **2948** indicates the token corresponding to "love", **26549** indicates the token corresponding to "Beijing", **11** indicates the token corresponding to ", " (comma and space), and **1576** indicates the token corresponding to "because". This format can be directly passed to the model for inference. Similarly, [9707, 11, 1207, 16948, 17] corresponds to the input sequence "Hello, Qwen2". 
In this example, two requests are passed at a time for batch calculation. + +- **Entire network computing**: The data and configuration of the current input token are specified so that the model object can iteratively infer the token result of each step through multiple rounds of computation. To simplify the code, you can encapsulate the iterative inference into the following generate function: + + .. code:: python + + from typing import List + from mindspore import ops, mint, Tensor, dtype + from qwen2 import Qwen2Config, Qwen2ModelInput, Qwen2ForCausalLM, CacheManager, sample + + def generate(model: Qwen2ForCausalLM, cache_manager: CacheManager, input_ids: List, max_new_tokens: int, max_seq_lens: int, eos_token_id: int): + batch_size = len(input_ids) + assert max_seq_lens >= max(map(len, input_ids)) + + cur = min(map(len, input_ids)) + is_prefill = True + it = 0 + + decode_q_seq_lens = Tensor([1 for _ in range(batch_size)], dtype=dtype.int32) + decode_mask = ops.zeros((1, 1), dtype=config.param_dtype) + attn_mask = None + q_seq_lens = None + + while cur <= max_seq_lens and it < max_new_tokens: + batch_valid_length = Tensor([cur for _ in range(batch_size)], dtype=dtype.int32) + if is_prefill: + inp = Tensor([input_ids[i][:cur] for i in range(batch_size)], dtype=dtype.int32) + pos = mint.arange(cur).astype(dtype.int32) + block_tables, slot_mapping = cache_manager.step(0, cur) + attn_mask = ops.logical_not(ops.sequence_mask(pos + 1, cur)).astype(config.param_dtype) + q_seq_lens = None + else: + inp = Tensor([[input_ids[i][cur - 1]] for i in range(batch_size)], dtype=dtype.int32) + pos = Tensor([[cur - 1] for _ in range(batch_size)], dtype=dtype.int32).view(-1) + block_tables, slot_mapping = cache_manager.step(cur - 1, 1) + attn_mask = decode_mask + q_seq_lens = decode_q_seq_lens + + model_input = Qwen2ModelInput( + input_ids=inp, + positions=pos, + batch_valid_length=batch_valid_length, + is_prefill=is_prefill, + attn_mask=attn_mask, + k_caches=cache_manager.k_caches, + v_caches=cache_manager.v_caches, + block_tables=block_tables, + slot_mapping=slot_mapping, + q_seq_lens=q_seq_lens + ) + + logits = model(model_input) + + next_tokens = sample(logits) + + for i in range(batch_size): + if cur >= len(input_ids[i]): + input_ids[i].append(int(next_tokens[i])) + + cur += 1 + it += 1 + if is_prefill: + is_prefill = False + + for i in range(batch_size): + if eos_token_id in input_ids[i]: + eos_idx = input_ids[i].index(eos_token_id) + input_ids[i] = input_ids[i][: eos_idx + 1] + + return input_ids + + The generate function simulates the iteration process of LLM inference. The core steps are as follows: + + 1. **Model input preparation**: Prepare the input data required for model inference and construct the Qwen2ModelInput object. The main parameters are as follows: + + **input_ids**: specifies the list of input vocabulary IDs. Each batch is represented by a list. + + **positions**: specifies position information of the input vocabulary in the inference statement, which is mainly used for RoPE. + + **batch_valid_length**: specifies the length of the current inference statement, which is used to obtain the KV of KVCache. Generally, the value is the value of **positions** plus 1. In speculative inference scenarios, the value may be greater than the value of **positions** plus 1. + + **is_prefill**: specifies whether full inference is performed. Full inference needs to compute multiple KVs. 
Incremental inference can reuse the KV results computed in the previous computation, and only the last KV needs to be computed. + + **attn_mask**: hides unnecessary information during attention score computation. It is usually a standard matrix with an upper or lower triangle (valid elements are marked with **1** and others are **0**). + + **kv_caches**: specifies the KVCache object, which stores all computed KV results. + + **block_tables&slot_mapping**: specifies the KVCache information used by the current inference vocabulary. **block_tables** indicates the block used by each batch, and **slot_mapping** indicates the position of the corresponding word in the block. For example, if **block_tables** is **[2, 10]**, **slot_mapping** is **[1200]**, and **block_size** is **128**, the second and tenth blocks are used for inference, and the 1200th block unit is used for the current word, that is, the KV of the 48th unit in the tenth block. + + **q_seq_lens**: specifies the length of the query in attention, which is mainly used by the PagedAttention operator. The value is **1** in the standard model, and may be greater than 1 in speculative inference scenarios. + + 2. **Model calculation**: Call the main model network to start the model computation logic and compute the probability distribution of the next word. + + 3. **Sampling result**: Obtain the ID of the next word through sampling computing (**argmax** is used as an example, that is, the word with the highest probability is selected). + + 4. **Input update of the next iteration**: Update the word list of the next iteration and enter the next iteration. + + After the iteration is complete, you can optimize the model. The model inference ends based on the number of inference words. The inference result may be suddenly interrupted. Therefore, you can use the tokenizer's sentence segmentation table ID to enclose the result at the position of the last sentence segmentation (for example, period) to enhance the readability of the text result. After the encapsulation is complete, you can call the word generation process using the following code: + + .. code:: python + + output = generate( + model=model, + cache_manager=cache_manager, + input_ids=input_ids, + max_new_tokens=max_new_tokens, + eos_token_id=tokenizer.eos_token_id, + max_seq_lens=max_seq_lens + ) + +- **Postprocessing**: Based on the network inference output, use the conversion capability of the tokenizer to convert the token ID list into a comprehensible statement. + + .. code:: python + + result = [tokenizer.decode(a) for a in output] + print(result) + + After the Python code is executed, the following information is displayed: + + .. code:: shell + + I love Beijing, because it is a city that is constantly changing. I have been living here for 10 years and I have seen the city changes so much. ... + + It can be seen that the model-inferred token IDs are translated to a human-readable statement. In actual verification, due to the randomness of **do_sample**, each inference is different, but the result logic is basically understandable. + + For details about the complete end-to-end example, see `infer.py `_. + +Model Parallelism +~~~~~~~~~~~~~~~~~~~~ + +For LLMs with many model parameters, such as Llama2-70B and Qwen2-72B, the parameter scale usually exceeds the memory capacity of a GPU or NPU. Therefore, multi-device parallel inference is required. MindSpore LLM inference can shard the original LLM into *N* parallel models so that they can be executed on multiple devices in parallel. 
This not only enables inference for super LLMs but also enhances performance by leveraging more resources from the multiple devices. The model scripts provided by the MindFormers model suite can be used to shard a model into multi-device models for execution. + +Currently, mainstream model parallel methods include the following: + +- **Data parallelism**: The data to be computed is divided into multiple parallel parts and computed on multiple devices in parallel. In the inference scenario, multiple statements can be computed in parallel through batch processing. Data parallelism can be understood as multiple model instances executed in parallel, and therefore no additional model adaptation is required. + +- **Tensor parallelism**: The operators to be computed by the model are sharded according to the network script definition. In the inference scenario, the number of shards is usually equal to the number of devices. The input and output of operator computation in the network change with the parallelism degree. Therefore, the model needs to be adapted to the parallelism. + +- **Pipeline parallelism**: The model is sharded into multiple instances based on the number of layers. Pipeline computation can be implemented between multiple requests. The network is sharded into multiple subnets. Therefore, the model needs to be adapted to the parallelism. + +- **Expert parallelism**: This is a parallel strategy specific to MoE LLMs. Different expert computations are distributed to different compute entities in parallel, and the computing performance is improved through concurrent expert control. + +To more clearly describe the model parallel computing process, this section describes the most basic and common model parallel policies. You can implement parallel adaptation of the model by performing the following steps: + +1. **Model adaptation**: When a MindSpore LLM is running on multiple devices, model parallelism is usually used. Therefore, the original model needs to be sharded based on the number of devices. For example, the matrix multiplication of [1024, 4096] and [4096, 2048] can be sharded into two matrix multiplications of [1024, 4096] and [4096, 1024], respectively. + Different sharding policies may bring different parallel computing performance. + For Qwen and Llama, the sharding mainly involves the linear operations on the query, key, and value data at the attention layer. + +2. **Weight adaptation**: In addition to the parallel reconstruction of the model structure, the weights in the model computation are also sharded. Therefore, the related weights need to be sharded during model loading to minimize the graphics memory occupied by unnecessary weight loading. For LLMs, the main weights are concentrated on the embedding and linear network layers. Therefore, the weight loading adaptation mainly involves the reconstruction of the two modules. + +3. **Model inference**: Unlike single-device inference, multi-device inference requires multiple processes to be started at the same time for parallel inference. Therefore, when starting model inference, multi-device inference requires running multiple groups of related processes at a time, instead of directly running scripts. The MindSpore framework provides the msrun parallel running tool. For details, see `Building a Parallel LLM Network <./ms_infer_parallel_infer.md>`_. 
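
As a minimal single-process sketch of the tensor-parallel sharding described in step 1 above (the shapes, the two-shard split, and the absence of real device communication are assumptions for illustration only, not the actual MindFormers sharding code), the output dimension of a linear weight can be split into per-device shards whose partial results are concatenated afterwards:

.. code:: python

    import numpy as np
    import mindspore as ms
    from mindspore import ops

    # Toy shapes from the example above: activation [1024, 4096], weight [4096, 2048].
    x = ms.Tensor(np.random.randn(1024, 4096).astype(np.float32))
    w = ms.Tensor(np.random.randn(4096, 2048).astype(np.float32))

    # Column-parallel split: each "device" holds one [4096, 1024] weight shard.
    w_shards = ops.split(w, 1024, axis=1)

    # Each shard computes its partial output independently; in a real multi-device
    # setup these matmuls run on different NPUs.
    partials = [ops.matmul(x, shard) for shard in w_shards]

    # Concatenating the partial outputs (an all-gather in the real setup) restores
    # the full [1024, 2048] result of the unsharded matmul.
    y = ops.cat(partials, axis=1)
    print(y.shape)  # (1024, 2048)

The real adaptation additionally partitions weight loading and inserts communication operators, but the numerical equivalence shown here is what makes the sharding safe.
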
+ +Model Quantization +~~~~~~~~~~~~~~~~~~~~ + +The MindSpore LLM supports the following quantization technologies to improve the inference performance: + +- **A16W8/A16W4 quantization**: quantizes the weights of an LLM, saving float16 weights as 8-bit int8 or 4-bit int4 data. Before computation, the weights are de-quantized back to float16, reducing memory usage, enhancing model concurrency, and improving inference throughput. + +- **A8W8 quantization**: quantizes the entire network of an LLM, converting float16 activations to 8-bit int8 data for computation. This doubles the computational efficiency of GPU or NPU computing units (for example, from 16 x 16 to 32 x 16). Specific quantization operators are required. This not only reduces memory usage but also significantly enhances computational performance. + +- **KVCache quantization**: reduces graphics memory consumption, effectively enhancing overall throughput. (KVCache consumes considerable graphics memory and model weights in LLM inference.) MindSpore supports quantizing KVCache from float16 to int8. Through flash attention and page attention, quantization and dequantization are fused into operators to reduce the overhead caused by quantization and improve the overall throughput. + +To quantize a model using golden-stick, perform the following steps: + +1. **Weight quantization**: Use a quantization algorithm to convert the model weight data from float16 to int8. + +2. **Model inference**: Load the standard model, quantize the model network (by inserting corresponding quantization operators), load the quantized weight, and call the model inference. + +For details about model quantization, see `Quantization <./ms_infer_quantization>`_. + +Advanced Usage +----------------- + +- **Using custom operators to optimize model inference** + + The MindSpore LLM inference supports the use of custom operators to optimize operators in specific scenarios or implement operator fusion on the network. Custom operators can be enabled or disabled by simply modifying the operator API in the network script. For details, see `Custom Operators <../../custom_program/operation/op_custom_ascendc.md>`_. + +- **Offline inference of LLMs** + + Given the substantial size of LLMs, you are advised to use more flexible online inference (weight CKPT and network script) for MindSpore LLM inference. However, in specific scenarios, such as running device or edge LLMs with limited running environments lacking Python or MindSpore packages, you can use the MindSpore Lite offline inference solution. + + In this case, you need to export the model to a MindIR file, which is the unified model expression of MindSpore, and send the file to the MindSpore Lite runtime. For details, see `Lite Inference Overview <../lite_infer/overview.md>`_. 
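
The MindIR export mentioned in the offline-inference note above can be sketched as follows. This is a minimal illustration using a placeholder toy network and file name (assumptions for this example); the real flow exports the constructed inference network with representative input shapes through the standard ``mindspore.export`` interface.

.. code:: python

    import numpy as np
    import mindspore as ms
    from mindspore import nn

    class ToyNet(nn.Cell):
        """Placeholder network standing in for the LLM inference network."""
        def __init__(self):
            super().__init__()
            self.dense = nn.Dense(16, 4)

        def construct(self, x):
            return self.dense(x)

    net = ToyNet()
    dummy_input = ms.Tensor(np.ones((1, 16), np.float32))

    # Export to the unified MindIR format consumed by MindSpore Lite offline inference.
    ms.export(net, dummy_input, file_name="toy_net", file_format="MINDIR")
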
diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_model_serving_infer.md b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_serving_infer.md new file mode 100644 index 0000000000000000000000000000000000000000..7fef1b3ad25238ae825a00227cf79cdc144562d8 --- /dev/null +++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_serving_infer.md @@ -0,0 +1,166 @@ + +# Service-oriented Model Inference + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/tutorials/source_en/model_infer/ms_infer/ms_infer_model_serving_infer.md) + +## Background + +MindSpore is an AI model development framework that provides efficient model development capabilities. Generally, the following code is used for model inference: + +```python + +input_str = "I love Beijing, because" + +model = Qwen2Model(config) +model.load_weight("/path/to/model") + +input_ids = tokenizer(input_str)["input_ids"] + +logits = model(input_ids) + +next_token = ops.argmax(logits) + +generate_text = tokenizer.decode(next_token) + +print(generate_text) +``` + +This model inference mode is simple, but the model and weight need to be reloaded each time inference is performed. As a result, the inference efficiency is low in actual applications. To solve this problem, a model inference backend service is usually deployed to receive inference requests online and send requests to the model for computing. This inference mode is called service-oriented inference. MindSpore does not provide the service-oriented inference capability. If service-oriented inference is required in actual applications, you need to develop a service backend and integrate the related model. + +To help users easily deploy out-of-the-box model inference capabilities in the production environment, MindSpore provides full-stack service-oriented model inference capabilities based on the popular vLLM model inference open-source software. Service-oriented inference supports real-time online inference and efficiently improves the overall throughput of model inference and reduces inference costs through efficient user request scheduling. + +## Main Features + +As an efficient service-oriented model inference backend, it should provide the following capabilities to maximize the deployment and running efficiency of models: + +- **Quick startup**: Quick loading and initialization of LLMs are implemented through technologies such as compilation cache and parallel loading, reducing the extra startup overhead caused by the continuous increase of model weights. + +- **Batch inference**: A proper batch grouping mechanism is used to implement optimal user experience in the case of massive concurrent requests. + +- **Efficient scheduling**: Full and incremental request scheduling is used to address full and incremental inference requirements of LLMs, maximizing resource computing efficiency and improving system throughput. + +## Inference Tutorial + +MindSpore inference works with the vLLM community solution to provide users with full-stack end-to-end inference service capabilities. The vLLM MindSpore adaptation layer implements seamless interconnection of the vLLM community service capabilities in the MindSpore framework. For details, see [vLLM MindSpore](https://www.mindspore.cn/vllm_mindspore/docs/en/master/index.html). + +This section describes the basic usage of vLLM MindSpore service-oriented inference. 
+ +### Setting Up the Environment + +The vLLM MindSpore adaptation layer provides an environment installation script. You can run the following commands to create a vLLM MindSpore operating environment: + +```shell +# download vllm-mindspore code +git clone https://gitee.com/mindspore/vllm-mindspore.git +cd vllm-mindspore + +# create conda env +conda create -n vllm-mindspore-py311 python=3.11 +conda activate vllm-mindspore-py311 + +# install extra dependent packages +pip install setuptools_scm +pip install numba + +# run install dependences script +bash install_depend_pkgs.sh + +# install vllm-mindspore +python setup.py install +``` + +After the vLLM MindSpore operating environment is created, you need to install the following dependency packages: + +- **mindspore**: MindSpore development framework, which is the basis for model running. + +- **vLLM**: vLLM service software. + +- **vllm-mindspore**: vLLM extension that adapts to the MindSpore framework. It is required for running MindSpore models. + +- **msadapter**: adaptation layer for MindSpore to connect to PyTorch. Some vLLM functions depend on the PyTorch capabilities and need to be adapted by MSAdapter. + +- **golden-stick**: MindSpore model quantization framework. If the quantization capability is required, install this software. + +- **mindformers**: Transformer model library provided by the MindSpore framework. You can use the models directly or connect to the native models of MindSpore. + +### Preparing a Model + +The service-oriented vLLM MindSpore supports the direct running of the native Hugging Face model. Therefore, you can directly download the model from the Hugging Face official website. The following uses the Qwen2-7B-Instruct model as an example: + +```shell +git lfs install +git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct +``` + +If `git lfs install` fails during the pull process, refer to the vLLM MindSpore FAQ for a solution. + +### Starting a Service + +Before starting the backend service, you need to set the environment variables based on the actual environment. + +```shell +# set Ascend CANN tools envs +source /usr/local/Ascend/ascend-toolkit/set_env.sh +export ASCEND_CUSTOM_PATH=${ASCEND_HOME_PATH}/../ +export ASCEND_RT_VISIBLE_DEVICES=3 +export ASCEND_TOTAL_MEMORY_GB=32 + +# mindspore envs +export MS_ALLOC_CONF=enable_vmm:true +export CPU_AFFINITY=0 + +# vLLM envs +export VLLM_MODEL_MEMORY_USE_GB=26 + +# backend envs +export VLLM_MASTER_IP=127.0.0.1 +export VLLM_RPC_PORT=12390 +export VLLM_HTTP_PORT=8080 +unset vLLM_MODEL_BACKEND + +# model envs +export MODEL_ID="/path/to/model/Qwen2-7B-Instruct" +``` + +Run the following command to start the vLLM MindSpore service backend: + +```shell +vllm-mindspore serve --model=${MODEL_ID} --port=${VLLM_HTTP_PORT} --trust_remote_code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block_size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 1 --data-parallel-size 1 --data-parallel-size-local 1 --data-parallel-start-rank 0 --data-parallel-address ${VLLM_MASTER_IP} --data-parallel-rpc-port ${VLLM_RPC_PORT} &> vllm-mindspore.log & +``` + +After the backend service is loaded, the listening port and provided APIs of the backend service are displayed. 
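+
+If you want to confirm that the service is reachable before sending an inference request, you can query the model list first. The following check is a minimal sketch and assumes the OpenAI-compatible `/v1/models` endpoint exposed by vLLM:
+
+```shell
+# list the models served by the backend; a JSON response indicates that the service is ready
+curl http://${VLLM_MASTER_IP}:${VLLM_HTTP_PORT}/v1/models
+```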
+ +### Sending a Request + +You can run the following command to send an HTTP request to implement model inference: + +```shell +curl http://${VLLM_MASTER_IP}:${VLLM_HTTP_PORT}/v1/completions -H "Content-Type: application/json" -d "{\"model\": \"${MODEL_ID}\", \"prompt\": \"I love Beijing, because\", \"max_tokens\": 128, \"temperature\": 1.0, \"top_p\": 1.0, \"top_k\": 1, \"repetition_penalty\": 1.0}" +``` + +After receiving the inference request, the service backend calculates and returns the following results: + +```json +{ + "id":"cmpl-1c30caf453154b5ab4a579b7b06cea19", + "object":"text_completion", + "created":1754103773, + "model":"/path/to/model/Qwen2-7B-Instruct", + "choices":[ + { + "index":0, + "text":" it is a city with a long history and rich culture. I have been to many places of interest in Beijing, such as the Great Wall, the Forbidden City, the Summer Palace, and the Temple of Heaven. I also visited the National Museum of China, where I learned a lot about Chinese history and culture. The food in Beijing is also amazing, especially the Peking duck and the dumplings. I enjoyed trying different types of local cuisine and experiencing the unique flavors of Beijing. The people in Beijing are friendly and welcoming, and they are always willing to help tourists. I had a great time exploring the city and interacting with the locals", + "logprobs":null, + "finish_reason":"length", + "stop_reason":null, + "prompt_logprobs":null + } + ], + "usage":{ + "prompt_tokens":5, + "total_tokens":133, + "completion_tokens":128, + "prompt_tokens_details":null + } +} +``` diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_network_develop.md b/tutorials/source_en/model_infer/ms_infer/ms_infer_network_develop.md new file mode 100644 index 0000000000000000000000000000000000000000..86ca7cb2ac0181e2b9753fd25d8a89d84241b752 --- /dev/null +++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_network_develop.md @@ -0,0 +1,723 @@ +# Building an LLM Inference Network from Scratch + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/tutorials/source_en/model_infer/ms_infer/ms_infer_network_develop.md) + +## Model Development Modes + +MindSpore provides two model running modes: + +- **Static graph mode**: The model network is compiled into a complete network graph for convergence and optimization, improving the model execution performance. However, due to some syntax support issues, model development has certain limitations, affecting the usability. + +- **Dynamic graph mode**: Python statements of network scripts are executed one by one, facilitating printing and debugging (by using the PDB) at any time. This mode is easy to use, but its performance is not as good as that of the static graph mode. + +In MindSpore, you are advised to use the dynamic graph mode to develop a model and then convert dynamic graphs to static graphs as required to obtain the maximum model performance. + +## Backbone Network Used for Development in Dynamic Graph Mode + +Most mainstream LLMs use the Transformer-based backbone network, where core computing relies on the self-attention mechanism. The following figure uses the Qwen2 LLM as an example to show the backbone network architecture. 
+ +![Qwen2 network architecture](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/tutorials/source_zh_cn/model_infer/ms_infer/images/llm_qwen2_network_arch.png) + +The core layer of Qwen2 consists of the following parts: + +- **Embedding**: converts the index corresponding to each token into a vector to implement feature dispersion. Similar to one-hot vectorization, the embedding weights are involved in the training process, which can better adapt to the context semantics in the LLM. This process is implemented through the embedding operator. + +- **DecodeLayer**: refers to the Transformer structure, which is a key compute module of the LLM. Generally, multiple layers of computation are configured as needed. Each layer is actually a Transformer structure. + +- **RmsNorm & Linear**: linearly normalizes the output of each layer to the same dimension as the model vocabulary after computation by the transformer structure and returns the probability distribution of each token. + +You can use the MindSpore LLM to build a network for inference. The network can be assembled as required using operators provided by MindSpore. The following uses the Qwen2 model as an example to describe how to build a model. For details about the complete end-to-end example, see [qwen2.py](https://gitee.com/mindspore/docs/blob/master/docs/sample_code/infer_code/qwen2/qwen2.py). + +### Basic Common Network Layer + +The Qwen2 LLM has many configurations and parameters. To manage them more conveniently, you need to define the Config and Input classes to be used by the model. In addition, note that the Linear and RmsNorm operators are frequently used in each functional layer of the network. You can build these common layers in advance. + +#### Config & Input + +```python +import json +from dataclasses import dataclass +from typing import Optional, Type, List, Tuple, Union + +from mindspore import Tensor, dtype + +@dataclass +class Qwen2Config: + """Qwen2 Config, the key-value is almost the same with config.json in Hugging Face""" + architectures: Optional[List[str]] = None + attention_dropout: float = 0.0 + bos_token_id: int = 151643 + eos_token_id: int = 151645 + hidden_act: str = "silu" + hidden_size: int = 3584 + initializer_range: float = 0.02 + intermediate_size: int = 18944 + max_position_embeddings: int = 32768 + max_window_layers: int = 28 + model_type: str = "qwen2" + num_attention_heads: int = 28 + num_hidden_layers: int = 28 + num_key_value_heads: int = 4 + rms_norm_eps: float = 1e-06 + rope_theta: float = 1000000.0 + sliding_window: Optional[int] = 131072 + tie_word_embeddings: bool = False + torch_dtype: str = "bfloat16" + transformers_version: str = "4.41.2" + use_cache: bool = True + use_sliding_window: bool = False + vocab_size: int = 152064 + param_dtype: Optional[Type] = dtype.bfloat16 # this is mindspore datatype as hugging face use str as dtype + + @classmethod + def from_json(cls, json_path: str) -> 'Qwen2Config': + with open(json_path) as f: + data = json.load(f) + config = cls(**data) + return config + + +@dataclass +class Qwen2ModelInput: + input_ids: Tensor + positions: Tensor + batch_valid_length: Tensor + is_prefill: bool + attn_mask: Tensor + k_caches: List[Tensor] + v_caches: List[Tensor] + slot_mapping: Tensor = None + block_tables: Tensor = None + hidden_state: Optional[Tensor] = None + residual: Optional[Tensor] = None + q_seq_lens: Optional[Tensor] = None +``` + +The Qwen2Config configuration is basically the same as that of Hugging Face. 
For details, see the official Qwen2 documentation. Note that **param_dtype** is used to replace **torch_dtype** in Qwen2Config because the data types of MindSpore are different from those of PyTorch. Qwen2ModelInput defines the model input, including the word ID, KVCache, and Attention fused operator, which are required by MindSpore inference optimization features. + +#### RmsNorm + +RmsNorm is a normalization algorithm commonly used in most LLMs. MindSpore provides operators that can be directly used. You only need to create the corresponding weights. In addition, RmsNorm often involves residual computing. The RmsNorm class implements residual converged computing at the network layer. The following is a code example: + +```python +from typing import Optional, Type, Union, Tuple + +from mindspore import nn, ops, mint, Parameter, Tensor + +class RmsNorm(nn.Cell): + def __init__(self, config: Qwen2Config) -> None: + super().__init__() + + self.rms_norm = ops.RmsNorm(config.rms_norm_eps) + + self.weight = Parameter( + mint.ones( + config.hidden_size, + dtype=config.param_dtype + ), + requires_grad=False + ) + + def construct(self, x: Tensor, residual: Optional[Tensor] = None) -> Union[Tensor, Tuple[Tensor, Tensor]]: + if residual is not None: + x = x + residual + residual = x + output = self.rms_norm(x, self.weight)[0] + if residual is None: + return output + return output, residual +``` + +#### Linear + +The Linear layer is actually a linear transformation. Its main computing logic is matrix multiplication (MatMul). However, bias correction may be required for addition depending on the specific application scenario (bias is required during query, key, and value conversion). The following code integrates these computations into a network structure: + +```python +from typing import Optional, Type + +from mindspore import nn, ops, mint, Parameter, Tensor + +class Qwen2Linear(nn.Cell): + def __init__(self, input_size: int, output_size: int, param_dtype: Optional[Type], enable_bias: bool) -> None: + super().__init__() + + self.param_dtype = param_dtype + self.input_size = input_size + self.output_size = output_size + self.enable_bias = enable_bias + + self.matmul = ops.MatMul(transpose_b=True) + self.weight = Parameter( + mint.zeros( + (self.output_size, self.input_size), + dtype=self.param_dtype + ), + requires_grad=False + ) + + if self.enable_bias: + self.bias_add = ops.Add() + self.bias = Parameter( + mint.zeros(self.output_size, dtype=self.param_dtype) + ) + + def construct(self, input: Tensor): + origin_shape = input.shape + x = self.matmul(input.view(-1, origin_shape[-1]), self.weight) + if self.enable_bias: + x = self.bias_add(x, self.bias) + return x.view(*origin_shape[:-1], -1) +``` + +Because multi-batch computation is required, the input **shape** may be *n* times of **input_size**. To ensure correct computation, the original input **shape** is saved. After the computation is complete, **shape** is restored through the view. + +### Qwen2ForCausalLM + +The Qwen2 model is usually encapsulated for specific services. For example, Qwen2ForCausalLM is an encapsulation of Qwen2 for language processing and dialog services. + +The Qwen2ForCausalLM class is used to clearly define the main APIs of the model. 
The following shows the specific implementation: + +```python +from glob import glob +from typing import Optional, Type + +from mindspore import nn, Tensor, load_checkpoint, load_param_into_net + +class Qwen2ForCausalLM(nn.Cell): + def __init__(self, config: Qwen2Config) -> None: + super().__init__() + + self.model = Qwen2Model(config=config) + self.lm_head = Qwen2Linear( + input_size=config.hidden_size, + output_size=config.vocab_size, + param_dtype=config.param_dtype, + enable_bias=False + ) + + def load_weight(self, weight_path: str) -> None: + weight_dict = {} + for path in glob(weight_path + "/*.safetensors"): + weight_dict.update(load_checkpoint(path, format="safetensors")) + + load_param_into_net(self, weight_dict, strict_load=False) + + def construct(self, model_input: Qwen2ModelInput) -> Tensor: + hidden_state = self.model(model_input.input_ids, model_input.positions, + model_input.batch_valid_length, model_input.is_prefill, + model_input.k_caches, model_input.v_caches, model_input.slot_mapping, + model_input.block_tables, model_input.attn_mask, model_input.q_seq_lens) + logits = self.lm_head(hidden_state)[:, -1] + return logits +``` + +As shown in the code, Qwen2ForCausalLM has two core APIs: + +- load_weight: loads weights from the Hugging Face official website model and injects them into the model based on the network script. + +- construct: performs inference and computing, and calls submodules to complete computing layer by layer. + As shown in the construct, the core of the model is the backbone network computing and the linear computing of the last **lm_head**, which converts the features of **hidden_size** into the vocabulary probability distribution of **vocab_size**. + +### Qwen2Model + +Qwen2Model is the main network of the Qwen2 model. It consists of two parts: the embedding layer that converts the input into features and the decoder structure of *n* Transformer layers. + +#### Embedding + +The logic of the embedding layer is simple. It obtains the feature data (which is also a part of the training weight) of **hidden_size** based on the input word ID through a gather operator. The code is as follows: + +```python +from typing import Optional, Type + +from mindspore import nn, ops, mint, Parameter, Tensor + +class VocabEmbedding(nn.Cell): + def __init__(self, config: Qwen2Config) -> None: + super().__init__() + + self.num_embeddings = config.vocab_size + self.embedding_dim = config.hidden_size + + self.gather = ops.Gather() + + self.weight = Parameter( + mint.zeros( + (self.num_embeddings, self.embedding_dim), + dtype=config.param_dtype + ), + requires_grad=False + ) + + def construct(self, input_ids: Tensor): + return self.gather(self.weight, input_ids, 0) +``` + +#### DecoderLayer + +DecoderLayer is the core computing unit of the Transformer network. Most of the computing operations are performed at this layer. As shown in the Qwen2 network structure diagram, the network layers include RoPE, Attention, and MLP. To facilitate development, these network layers are constructed first. + +- **RoPE** + + The rotary position embedding (RoPE) operator is used to enhance the Attention mechanism's capability to perceive the distance between words by adding positional encoding information to the features of the query and key. Due to the features of RoPE, the result can be pre-computed and directly obtained by querying the table, thereby achieving efficient computation. This can be implemented using the gather and the RoPE operators. 
For details about the calculation method, see the related documents of RoPE. + + ```python + import numpy as np + from typing import Optional, Type + + from mindspore import nn, ops, mint, Parameter, Tensor + + class Qwen2RotaryEmbedding(nn.Cell): + def __init__(self, head_size: int, rotary_dim: int, max_position_embeddings: int, base: int, dtype: Optional[Type]) -> None: + super().__init__() + + self.head_size = head_size + self.rotary_dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.dtype = dtype + + # format 2 is neox style + self.rotary_embedding_op = ops.ApplyRotaryPosEmb(2) + self.gather = ops.Gather() + + self.freqs_cos, self.freqs_sin = self._compute_cos_sin_cache() + + def _compute_inv_freq(self) -> Tensor: + freqs_base = mint.arange(0, self.rotary_dim, 2).astype(np.float32) + freqs = 1.0 / (self.base ** (freqs_base / self.rotary_dim)) + return freqs + + def _compute_cos_sin_cache(self) -> Tuple[Tensor, Tensor]: + freqs = self._compute_inv_freq() + t = np.arange(0, self.max_position_embeddings, 1).astype(np.float32) + freqs = np.outer(t, freqs) + emb = np.concatenate((freqs, freqs), axis=1) + freqs_cos = np.cos(emb) + freqs_sin = np.sin(emb) + + freqs_cos = Tensor(freqs_cos, dtype=self.dtype) + freqs_sin = Tensor(freqs_sin, dtype=self.dtype) + return freqs_cos, freqs_sin + + def construct(self, positions: Tensor, query: Tensor, key: Tensor, batch_valid_length: Tensor, is_prefill: bool): + query = query.contiguous() + key = key.contiguous() + + if is_prefill: + freqs_cos = self.freqs_cos + freqs_sin = self.freqs_sin + else: + freqs_cos = self.gather(self.freqs_cos, positions.view(-1), 0) + freqs_sin = self.gather(self.freqs_sin, positions.view(-1), 0) + + return self.rotary_embedding_op(query, key, freqs_cos, freqs_sin, batch_valid_length) + ``` + +- **Attention** + + An attention layer consists of multiple Linear and RoPE operators, and attention score calculation. MindSpore provides two fusion operators, FlashAttention and PagedAttention, to enhance the inference performance of attention score calculation. + + However, because these native operators are oriented to multiple scenarios and the input is complex, they are encapsulated here to simplify the usage. 
For details about the code, see the following: + + ```python + import numpy as np + from typing import Optional, Type + + from mindspore import nn, ops, mint, Parameter, Tensor + + class FlashAttention(nn.Cell): + def __init__(self, scale: float, num_heads: int) -> None: + super().__init__() + + input_layout = "TH" + scale = scale + pre_tokens = 2147483647 + next_tokens = 2147483647 + self.flash_attention = ops.operations.nn_ops.FlashAttentionScore(head_num=num_heads, + scale_value=scale, + pre_tokens=pre_tokens, + next_tokens=next_tokens, + input_layout=input_layout) + + def construct(self, q: Tensor, k: Tensor, v: Tensor, attn_mask: Tensor, batch_valid_length: Tensor) -> Tensor: + _, _, _, output = self.flash_attention( + q, + k, + v, + None, + None, + None, + attn_mask, + None, + batch_valid_length, + batch_valid_length + ) + return output + + + class PagedAttention(nn.Cell): + def __init__(self, head_num: int, scale: float, num_kv_heads: int) -> None: + super().__init__() + + self.head_num = head_num + self.num_kv_heads = num_kv_heads + + self.paged_attention = ops.auto_generate.PagedAttention( + head_num=head_num, + scale_value=scale, + kv_head_num=num_kv_heads + ) + + def construct(self, q: Tensor, k_cache: Tensor, v_cache: Tensor, + block_tables: Tensor, batch_valid_length: Tensor, + attn_mask: Tensor, q_seq_lens: Tensor) -> Tensor: + output = self.paged_attention(q, k_cache, v_cache, block_tables, batch_valid_length, None, None, attn_mask, q_seq_lens) + return output + ``` + + The code of the attention layer may be implemented by using the constructed network layer. For details about the code, see the following: + + ```python + import numpy as np + from typing import Optional, Type + + from mindspore import nn, ops, mint, Parameter, Tensor + + + class Qwen2Attention(nn.Cell): + def __init__(self, config: Qwen2Config) -> None: + super().__init__() + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.num_kv_heads = config.num_key_value_heads + self.head_dim =config.hidden_size // self.num_heads + self.q_size = self.head_dim * self.num_heads + self.kv_size = self.head_dim * self.num_kv_heads + self.scaling = float(self.head_dim ** -0.5) + self.rope_theta = int(config.rope_theta) + self.param_dtype = config.param_dtype + self.max_position = config.max_position_embeddings + + self.flash_attn = FlashAttention(self.scaling, self.num_heads) + self.paged_attn = PagedAttention(self.num_heads, self.scaling, self.num_kv_heads) + self.reshape_and_cache = ops.auto_generate.ReshapeAndCache() + + self.q_proj = Qwen2Linear( + input_size=self.hidden_size, + output_size=self.q_size, + param_dtype=self.param_dtype, + enable_bias=True + ) + self.k_proj = Qwen2Linear( + input_size=self.hidden_size, + output_size=self.kv_size, + param_dtype=self.param_dtype, + enable_bias=True + ) + self.v_proj = Qwen2Linear( + input_size=self.hidden_size, + output_size=self.kv_size, + param_dtype=self.param_dtype, + enable_bias=True + ) + self.o_proj = Qwen2Linear( + input_size=self.q_size, + output_size=self.hidden_size, + param_dtype=self.param_dtype, + enable_bias=False + ) + + self.rotary_emb = Qwen2RotaryEmbedding( + head_size=self.head_dim, + rotary_dim=self.head_dim, + max_position_embeddings=self.max_position, + base=self.rope_theta, + dtype=self.param_dtype + ) + + def construct(self, hidden_state: Tensor, positions: Tensor, batch_valid_length: Tensor, + is_prefill: bool, layer_idx: int, k_cache: Tensor, v_cache: Tensor, + slot_mapping: Tensor, block_tables: Tensor, 
attn_mask: Tensor, + q_seq_lens: Tensor) -> Tensor: + bs, seq_len, hidden_dim = hidden_state.shape + + q = self.q_proj(hidden_state).view(-1, self.q_size) + k = self.k_proj(hidden_state).view(-1, self.kv_size) + v = self.v_proj(hidden_state).view(-1, self.kv_size) + + q, k = self.rotary_emb( + positions, + q, + k, + batch_valid_length, + is_prefill + ) + + k = k.contiguous() + v = v.contiguous() + + cache_out = self.reshape_and_cache( + k, + v, + k_cache, + v_cache, + slot_mapping + ) + q = ops.depend(q, cache_out) + + if is_prefill: + attn_output = self.flash_attn( + q, + k, + v, + attn_mask, + batch_valid_length + ) + else: + attn_output = self.paged_attn( + q, + k_cache, + v_cache, + block_tables, + batch_valid_length, + attn_mask, + q_seq_lens + ) + + output = self.o_proj(attn_output).view(bs, seq_len, -1) + return output + ``` + +- **MLP** + + An MLP layer, consisting of multiple Linear operators and an activation function (usually silu), is responsible for implementing non-linear computation of the network. The MLP layer can project problems to multiple non-linear spaces, thereby enhancing network capabilities. For details about the implementation, see the following code: + + ```python + import numpy as np + from typing import Optional, Type + + from mindspore import nn, ops, mint, Parameter, Tensor + + class Qwen2MLP(nn.Cell): + def __init__(self, config: Qwen2Config) -> None: + super().__init__() + + self.up_proj = Qwen2Linear( + input_size=config.hidden_size, + output_size=config.intermediate_size, + param_dtype=config.param_dtype, + enable_bias=False + ) + self.gate_proj = Qwen2Linear( + input_size=config.hidden_size, + output_size=config.intermediate_size, + param_dtype=config.param_dtype, + enable_bias=False + ) + self.down_proj = Qwen2Linear( + input_size=config.intermediate_size, + output_size=config.hidden_size, + param_dtype=config.param_dtype, + enable_bias=False + ) + self.act_fn = ops.silu + + def construct(self, x: Tensor) -> Tensor: + output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return output + ``` + +DecoderLayer may be constructed as follows by referring to the preceding network layer: + +```python +from typing import Tuple +from mindspore import nn, Tensor + +class Qwen2DecoderLayer(nn.Cell): + def __init__(self, config: Qwen2Config) -> None: + super().__init__() + + self.hidden_size = config.hidden_size + + self.self_attn = Qwen2Attention(config=config) + self.mlp = Qwen2MLP(config=config) + self.input_layernorm = RmsNorm(config=config) + self.post_attention_layernorm = RmsNorm(config=config) + + def construct(self, hidden_state: Tensor, residual: Tensor, positions: Tensor, + batch_valid_length: Tensor, is_prefill: bool, layer_idx: int, + k_cache: Tensor, v_cache: Tensor, slot_mapping: Tensor, + block_tables: Tensor, attn_mask: Tensor, q_seq_lens: Tensor) -> Tuple[Tensor, Tensor]: + if residual is None: + residual = hidden_state + hidden_state = self.input_layernorm(hidden_state) + else: + hidden_state, residual = self.input_layernorm(hidden_state, residual) + + hidden_state = self.self_attn(hidden_state, positions, batch_valid_length, is_prefill, + layer_idx, k_cache, v_cache, slot_mapping, block_tables, + attn_mask, q_seq_lens) + hidden_state, residual = self.post_attention_layernorm(hidden_state, residual) + hidden_state = self.mlp(hidden_state) + + return hidden_state, residual +``` + +#### Model + +After the embedding and decoder layers are constructed, you can construct the Model class by referring to the following code: + 
+
+```python
+from typing import List
+
+from mindspore import nn, ops, mint, Parameter, Tensor
+
+class Qwen2Model(nn.Cell):
+    def __init__(self, config: Qwen2Config) -> None:
+        super().__init__()
+
+        self.vocab_size = config.vocab_size
+        self.hidden_size = config.hidden_size
+        self.num_hidden_layers = config.num_hidden_layers
+
+        self.embed_tokens = VocabEmbedding(config=config)
+        self.layers = nn.CellList()
+        for i in range(config.num_hidden_layers):
+            layer = Qwen2DecoderLayer(config=config)
+            self.layers.append(layer)
+        self.norm = RmsNorm(config=config)
+
+    def construct(self, input_ids: Tensor, positions: Tensor, batch_valid_length: Tensor,
+                  is_prefill: bool, k_caches: List[Tensor], v_caches: List[Tensor],
+                  slot_mapping: Tensor, block_tables: Tensor, attn_mask: Tensor,
+                  q_seq_lens: Tensor) -> Tensor:
+        hidden_state = self.embed_tokens(input_ids)
+        residual = None
+
+        for i in range(self.num_hidden_layers):
+            layer = self.layers[i]
+            hidden_state, residual = layer(hidden_state, residual, positions, batch_valid_length,
+                                           is_prefill, i, k_caches[i], v_caches[i], slot_mapping,
+                                           block_tables, attn_mask, q_seq_lens)
+
+        hidden_state, _ = self.norm(hidden_state, residual)
+
+        return hidden_state
+```
+
+### KVCacheManager
+
+KVCache is usually used to optimize LLM inference. To use KVCache together with the FlashAttention and PagedAttention operators provided by MindSpore, some additional parameters need to be specified, including:
+
+- **k_cache & v_cache**: The kv_cache object can be considered as a cache table, which is used to store the keys and values in the previous iteration. In the next iteration, these values can be directly read, avoiding repeated computation of the keys and values of the first *n* words, thereby improving performance.
+
+- **block_tables & slot_mapping**: PagedAttention stores KVCache by block using a mechanism similar to paging, so that the same words can be concentrated in the same block, thereby improving graphics memory utilization.
+
+According to the preceding description, these parameters can be encapsulated in a management class.
The code can be implemented as follows:
+
+```python
+import math
+from collections import deque
+from typing import Tuple
+
+from mindspore import nn, ops, mint, Parameter, Tensor, dtype, mutable
+
+class CacheManager:
+    def __init__(self, config: Qwen2Config, block_num: int, block_size: int, batch_size: int) -> None:
+        self.block_num = block_num
+        self.block_size = block_size
+        self.batch_size = batch_size
+
+        head_dim = config.hidden_size // config.num_attention_heads
+
+        self.k_caches = mutable([ops.zeros((block_num, block_size, config.num_key_value_heads, head_dim), dtype=config.param_dtype) for _ in range(config.num_hidden_layers)])
+        self.v_caches = mutable([ops.zeros((block_num, block_size, config.num_key_value_heads, head_dim), dtype=config.param_dtype) for _ in range(config.num_hidden_layers)])
+        self.block_tables = [[] for _ in range(batch_size)]
+        self.acc_slot_mapping = [[] for _ in range(batch_size)]
+        self.free_block_ids = deque(range(block_num))
+
+    def step(self, start_pos_idx: int, token_num_per_batch: int) -> Tuple[Tensor, Tensor]:
+        for i in range(self.batch_size):
+            block_table = self.block_tables[i]
+            total_block_num = math.ceil((start_pos_idx + token_num_per_batch) / self.block_size)
+            now_block_num = len(block_table)
+            for _ in range(total_block_num - now_block_num):
+                block_id = self.free_block_ids.popleft()
+                block_table.append(block_id)
+                start_slot_id = block_id * self.block_size
+                self.acc_slot_mapping[i].extend(list(range(start_slot_id, start_slot_id + self.block_size)))
+
+        now_block_tables = Tensor(self.block_tables, dtype=dtype.int32)
+        now_slot_mapping = Tensor([self.acc_slot_mapping[i][start_pos_idx: start_pos_idx + token_num_per_batch]
+                                   for i in range(self.batch_size)], dtype=dtype.int32).view(-1)
+
+        return now_block_tables, now_slot_mapping
+```
+
+### Sampler
+
+After the backbone network is computed, the network output is a tensor of **shape** [*batch_size*, *vocab_size*], which represents the probability distribution over the vocabulary for the next word of each inference request in the batch. You need to select one word from the vocabulary as the final result. To simplify the selection and eliminate randomness, the word with the maximum probability is selected as the output each time, that is, an argmax computation is performed. The following is a code example:
+
+```python
+from mindspore import Tensor
+
+def sample(logits: Tensor) -> Tensor:
+    next_token = logits.argmax(axis=-1, keepdims=True)
+    return next_token
+```
+
+## Converting Dynamic Graphs to Static Graphs
+
+MindSpore can convert dynamic graphs to static graphs using JIT to improve inference performance.
In terms of code implementation, you can use the following simple decorator for conversion: + +```python +from mindspore import nn, ops, mint, Parameter, Tensor, jit + + +class Qwen2Model(nn.Cell): + def __init__(self, config: Qwen2Config) -> None: + super().__init__() + + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.num_hidden_layers = config.num_hidden_layers + + self.embed_tokens = VocabEmbedding(config=config) + self.layers = nn.CellList() + for i in range(config.num_hidden_layers): + layer = Qwen2DecoderLayer(config=config) + self.layers.append(layer) + self.norm = RmsNorm(config=config) + + @jit(jit_level="O0", infer_boost="on") + def construct(self, input_ids: Tensor, positions: Tensor, batch_valid_length: Tensor, + is_prefill: bool, k_caches: List[Tensor], v_caches: List[Tensor], + slot_mapping: Tensor, block_tables: Tensor, attn_mask: Tensor, + q_seq_lens: Tensor) -> Tensor: + hidden_state = self.embed_tokens(input_ids) + residual = None + + for i in range(self.num_hidden_layers): + layer = self.layers[i] + hidden_state, residual = layer(hidden_state, residual, positions, batch_valid_length, + is_prefill, i, k_caches[i], v_caches[i], slot_mapping, + block_tables, attn_mask, q_seq_lens) + + hidden_state, _ = self.norm(hidden_state, residual) + + return hidden_state +``` + +Add the mindspore.jit decorator to the construct method of nn.Cell to execute the computation of the cell in static graph mode. The parameters are described as follows: + +- **jit_level**: specifies the compilation level. Currently, MindSpore inference supports O0 and O1 levels (some operator fusion optimization is involved). + +- **infer_boost**: enables inference acceleration optimization. After this option is enabled, some scheduling optimization and stream optimization are performed during runtime to improve inference performance. + +In addition, due to the limitations of the static graph mode of MindSpore, dynamic-to-static conversion may fail in some scenarios. The following lists some common causes: + +- **setattrs usage**: The setattrs syntax of Python is not supported during MindSpore graph capture. Therefore, parameters cannot be encapsulated using an encapsulation class. For example, Qwen2ModelInput in the preceding example cannot be directly passed to Qwen2Model whose graph is converted to a static graph. Otherwise, the static graph execution fails. + +- **List value**: If there are list parameters when the graph is converted to a static graph, the parameters must be wrapped by mutable to ensure that MindSpore can correctly process the parameters, for example, **k_caches** and **v_caches** in the preceding example. Otherwise, the fallback to Python is triggered, which affects the inference performance. In some scenarios, the computation may fail. + +- **Graph input name**: If the PagedAttention operator of MindSpore is used, the two graph inputs must be named **batch_valid_length** and **q_seq_lens**. Otherwise, the PagedAttention operator fails to be initialized. + +If you plan to use static graph inference in the future when developing models with MindSpore, you are advised to pay attention to the preceding limitations during dynamic graph development and debugging to avoid extra costs in subsequent migration and debugging. 
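+
+To see how these constraints shape the calling code, the following sketch drives the classes built in this tutorial end to end. It is illustrative only: the token IDs, attention mask, and cache sizes are placeholder values chosen for this example, and weight loading is omitted.
+
+```python
+import numpy as np
+from mindspore import Tensor, dtype
+
+config = Qwen2Config()                       # default configuration defined above
+model = Qwen2ForCausalLM(config)             # call model.load_weight(...) in real use
+cache = CacheManager(config, block_num=16, block_size=128, batch_size=1)
+
+seq_len = 3
+input_ids = Tensor(np.array([[100, 200, 300]]), dtype=dtype.int32)   # placeholder token IDs
+positions = Tensor(np.arange(seq_len), dtype=dtype.int32)
+batch_valid_length = Tensor([seq_len], dtype=dtype.int32)            # name must stay batch_valid_length
+q_seq_lens = Tensor([seq_len], dtype=dtype.int32)                    # name must stay q_seq_lens
+# placeholder causal mask for the prefill step
+attn_mask = Tensor(np.triu(np.ones((seq_len, seq_len)), k=1), dtype=config.param_dtype)
+block_tables, slot_mapping = cache.step(start_pos_idx=0, token_num_per_batch=seq_len)
+
+model_input = Qwen2ModelInput(
+    input_ids=input_ids,
+    positions=positions,
+    batch_valid_length=batch_valid_length,
+    is_prefill=True,
+    attn_mask=attn_mask,
+    k_caches=cache.k_caches,                 # cache lists are already wrapped with mutable
+    v_caches=cache.v_caches,
+    slot_mapping=slot_mapping,
+    block_tables=block_tables,
+    q_seq_lens=q_seq_lens,
+)
+
+# Qwen2ForCausalLM unpacks the dataclass into plain tensors before they reach the
+# jit-compiled Qwen2Model, so the setattrs limitation does not apply inside the graph.
+logits = model(model_input)
+next_token = sample(logits)
+```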
diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md b/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md new file mode 100644 index 0000000000000000000000000000000000000000..45b6144a7d105605438004ed23de46b16458aef9 --- /dev/null +++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md @@ -0,0 +1,941 @@ +# Building a Parallel LLM Network + +[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md) + +As model sizes continue to expand, the computing resources required by LLMs, particularly graphics memory, are growing exponentially. For example, the Qwen2-72B requires approximately 144 GB of graphics memory at half-precision (FP16). + +In addition, the increasing sequence length of LLMs places immense pressure on graphics memory. Graphics memory not only affects model loading, but also limits the batch size. A small batch size may reduce the inference efficiency, which in turn affects the throughput of the entire system. + +The pressure on graphics memory makes it challenging for a single device to complete inference tasks within a reasonable time frame, and parallel computing has become a key strategy to address this challenge. This section uses the network structure of a common LLM as an example to analyze the model parallelism solution. + +## Model Parallelism Requirement Analysis + +Before performing model sharding and parallelism, you need to analyze the parallelism based on the model structure to determine which layers can be parallelized and how to divide the model to achieve better performance acceleration. To achieve better acceleration, the parallelized part needs to be computed separately, minimizing the impact on other parts. The following uses the Qwen2 model structure as an example to analyze the parallelism of the main network structure: + +- **Embedding**: The embedding layer is actually a gather operation and can be parallelized properly regardless of the sharding dimension (**hidden_dim** or **num_embeddings**). Because **all_reduce** (reducing overheads of data arrangement) can be better performed based on **num_embedding**, sharding is performed based on the **num_embeddings** dimension. + +- **Attention**: The Qwen2 model uses the attention computation method of GQA, that is, multiple independent attention computations. Therefore, the query, key, and value can be parallelized separately by column. However, the number of shards must be exactly divided by the number of attention heads. + +- **MLP**: The MLP layer is actually a matrix multiplication of two Linear layers, which can be sharded by block. + +- **RmsNorm&Add**: RmsNorm needs to normalize a row of data, which requires global information. Therefore, RmsNorm cannot be effectively parallelized. In this case, you need to use all_reduce to summarize data and then compute data. In addition, Add and RmsNorm usually used together and cannot be sharded. + +- **LMHead**: The LMHead layer is actually a Linear layer. The input **shape** is usually (*batch_size*, *hidden_size*) multiplied by (*hidden_size*, *vocab_size*). You can perform sharding by **vocab_size** and combine them using all_gather for acceleration. + +The following figure shows the execution of one Qwen2 layer with a parallelism degree of 2. 
+ +![matmul1](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/tutorials/source_zh_cn/model_infer/ms_infer/images/llm_qwen2_parallel_split.png) + +As shown in the figure, RmsNorm cannot be sharded. Therefore, an AllReduce operator needs to be added to the network before each RmsNorm computing to synchronize the computing results of each subprocess. The result after RmsNorm is usually **hidden_states**. Therefore, the result can be sharded by column-wise Linear and allocated to each subprocess for computing and then normalized by RowLinear. + +## Model Module Parallelism Solution + +The Linear layer is the main network layer for sharding, and its core is MatMul (matrix computation). Therefore, matrix sharding and computation is the most important part of model parallelism. + +### Basic MatMul Module + +![matmul1](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/tutorials/source_zh_cn/model_infer/ms_infer/images/gmm.png) + +![matmul2](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/tutorials/source_zh_cn/model_infer/ms_infer/images/matmul.png) + +In LLM computations, matrix multiplication (MatMul) accounts for a significant portion of both weight and computation workload. MatMul exhibits both column-wise parallelism and row-wise parallelism. + +![Column-wise Parallelism](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/tutorials/source_zh_cn/model_infer/ms_infer/images/column.png) + +![Row-wise Parallelism](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/tutorials/source_zh_cn/model_infer/ms_infer/images/row.png) + +Starting with the original implementation of `nn.Dense` in MindSpore, we can build implementations for both column-wise and row-wise MatMul. + +1. Creation and management of communication domains and management of LLM configurations + + Build the `CommunicationHelper` class to manage the model parallel domain. + + ```python + from mindspore.communication import create_group, get_group_size, get_rank + ``` + + ```python + class CommunicationHelper: + def __init__(self, group_name, size): + self.group_name = group_name + self.size = size + self.rank_list = [i for i in range(size)] + + def create_tensor_model_parallel_group(self): + create_group(group=self.group_name, rank_ids=self.rank_list) + + def get_tensor_model_parallel_group_size(self): + return get_group_size(group=self.group_name) + + def get_tensor_model_parallel_group_rank(self): + return get_rank(group=self.group_name) + + def get_tensor_model_parallel_group(self): + return self.group_name + ``` + + Build `ConfigHelper` to manage and configure LLM parameters. + + ```python + class ConfigHelper: + def __init__(self, + vocab_size, + hidden_size, + ffn_hidden_size, + num_layers, + batch_size, + seq_length, dtype, + num_heads, + has_bias=False): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.num_layers = num_layers + self.batch_size = batch_size + self.seq_length = seq_length + self.dtype = dtype + self.num_heads = num_heads + self.has_bias = has_bias + ``` + +2. Column-wise MatMul + + The `ColumnParallelLinear` class computes the weight shape after sharding and initializes the weights based on the number of devices for model parallelism. Column-wise parallelism divides `out_channels`. In the model forward propagation process, the MatMul is called to compute the result after parallelism. 
You can perform `AllGather` on the parallelized result to obtain the complete output. + + The MindSpore training and inference integrated framework supports enabling **infer_boost**. This parameter activates the high-performance self-developed operator library within the MindSpore framework. To enable this mode, you need to: + + 1. Set variables. + + ```python + from mindspore import set_context + set_context(jit_config={"jit_level": 'O0', "infer_boost": 'on'}) + ``` + + 2. Set system environment variables. + + ```bash + export ASCEND_HOME_PATH={$ascend_custom_path} + ``` + + For example, if there are 2 devices for model parallelism, set environment variables, initialize the communication group, and configure the model parameter **config** as follows: + + ```python + from mindspore import nn, Parameter, ops, Tensor + from mindspore.common import dtype as mstype + from mindspore.communication import init + from mindspore.common.initializer import initializer + import numpy as np + + from mindspore import set_context + set_context(jit_config={"jit_level": 'O0', "infer_boost": 'on'}) + + TP_GROUP_NAME='tp' + TP_SIZE = 2 + COMMUN_HELPER = CommunicationHelper(group_name=TP_GROUP_NAME, size=TP_SIZE) + + init() + COMMUN_HELPER.create_tensor_model_parallel_group() + + config = ConfigHelper(batch_size=64, + vocab_size=32000, + num_layers=4, + seq_length=2048, + hidden_size=1024, + ffn_hidden_size=4096, + dtype=mstype.float16, + num_heads=8, + has_bias=False) + ``` + + The column-wise MatMul module is implemented as follows: + + ```python + class ColumnParallelLinear(nn.Cell): + def __init__(self, + in_channels, + out_channels, + weight_init=None, + bias_init=None, + has_bias=True, + dtype=mstype.float32): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.has_bias = has_bias + self.tensor_parallel_group_size = COMMUN_HELPER.get_tensor_model_parallel_group_size() + self.out_channels_per_partition = out_channels // self.tensor_parallel_group_size + self.dtype = dtype + weight_shape = (self.out_channels_per_partition, self.in_channels) + self.weight = Parameter(initializer(weight_init, weight_shape, self.dtype), name="weight") + if self.has_bias: + self.bias = Parameter(initializer(bias_init, (self.out_channels_per_partition), self.dtype), name="bias") + self.bias_add = ops.Add() + self.matmul = ops.BatchMatMul(transpose_b=True) + self.cast = ops.Cast() + + def construct(self, x): + origin_dtype = x.dtype + x = self.cast(x, self.dtype) + out = self.matmul(x, self.weight) + if self.has_bias: + out = self.bias_add( + out, self.cast(self.bias, self.dtype) + ) + out = self.cast(out, origin_dtype) + return out + ``` + + The output of column-wise MatMul is parallelized. To obtain a complete output, use `GatherLastDim`. 
+ + ```python + class GatherLastDim(nn.Cell): + def __init__(self): + super().__init__() + self.all_gather = ops.AllGather(group=COMMUN_HELPER.get_tensor_model_parallel_group()) + self.world_size = COMMUN_HELPER.get_tensor_model_parallel_group_size() + self.split = ops.Split(axis=0, output_num=self.world_size) + + def construct(self, input_): + output = self.all_gather(input_) + tensor_list = self.split(output) + output = ops.cat(tensor_list, axis=-1) + return output + ``` + + Inference of column-wise MatMul: + + ```python + column_parallel_linear = ColumnParallelLinear(in_channels=config.hidden_size, + out_channels=config.hidden_size, + weight_init='normal', + dtype=config.dtype, + has_bias=False) + input_x = Tensor(np.random.randn(config.batch_size, config.seq_length, config.hidden_size).astype(np.float32)) + out_parallel = column_parallel_linear(input_x) + print(out_parallel.shape) + + gather_last_dim = GatherLastDim() + out = gather_last_dim(out_parallel) + print(out.shape) + ``` + +3. Row-wise MatMul + + Similar to column-wise MatMul, `RowParallelLinear` shards weights based on the size of the model parallelism domains. During initialization, sharding is performed by row, that is, sharding by `in_channels`. In the model forward propagation process, after the MatMul of the inputs and weights, `AllReduce` needs to be performed on the results of all `devices`. + + The row-wise MatMul module is implemented as follows: + + ```python + class RowParallelLinear(nn.Cell): + def __init__(self, + in_channels, + out_channels, + weight_init='normal', + bias_init=None, + has_bias=True, + dtype=mstype.float32): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.has_bias = has_bias + self.tensor_parallel_group_size = COMMUN_HELPER.get_tensor_model_parallel_group_size() + self.in_channels_per_partition = in_channels // self.tensor_parallel_group_size + self.dtype = dtype + weight_shape = (self.out_channels, self.in_channels_per_partition) + self.weight = Parameter(initializer(weight_init, weight_shape, self.dtype), name="weight") + if self.has_bias: + self.bias = Parameter(initializer(bias_init, (self.in_channels_per_partition), self.dtype), name="bias") + self.bias_add = ops.Add() + self.bmm = ops.BatchMatMul(transpose_b=True) + self.all_reduce = ops.AllReduce(group=COMMUN_HELPER.get_tensor_model_parallel_group()) + self.cast = ops.Cast() + + def construct(self, x): + origin_dtype = x.dtype + x = self.cast(x, self.dtype) + output_parallel = self.bmm(x, self.weight) + if self.has_bias: + output_parallel = self.bias_add(output_parallel, self.cast(self.bias, self.dtype)) + out = self.all_reduce(output_parallel) + out = self.cast(out, origin_dtype) + return out + ``` + + Inference of row-wise MatMul: + + ```python + row_parallel_linear = RowParallelLinear(in_channels=config.hidden_size, + out_channels=config.hidden_size, + weight_init='normal', + dtype=config.dtype, + has_bias=False) + out = row_parallel_linear(out_parallel) + print(out.shape) + ``` + +4. Embedding + + In addition to MatMul, the embedding layer can also be parallelized. The embedding weights can be sharded across multiple devices, with each device responsible for mapping a different range of token IDs. + + ![embedding2](./images/embedding2.png) + + Based on nn.Embedding, build an embedding layer for model parallelism. 
+ + ```python + class VocabParallelEmbedding(nn.Cell): + def __init__(self, + num_embeddings, + embedding_dim, + init_method="normal", + init_type=mstype.float32): + super().__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.tensor_model_parallel_size = COMMUN_HELPER.get_tensor_model_parallel_group_size() + per_partition_vocab_size = self.num_embeddings // self.tensor_model_parallel_size + self.vocab_start_index = COMMUN_HELPER.get_tensor_model_parallel_group_rank() * per_partition_vocab_size + self.vocab_end_index = self.vocab_start_index + per_partition_vocab_size + self.num_embeddings_per_partition = ( + self.vocab_end_index - self.vocab_start_index + ) + self.embedding_weight = Parameter( + initializer( + init=init_method, + shape=(self.num_embeddings_per_partition, self.embedding_dim), + dtype=init_type, + ), + name="embedding_weight", + ) + self.all_reduce = ops.AllReduce(group=COMMUN_HELPER.get_tensor_model_parallel_group()) + self.max_index_per_partition = Tensor(self.num_embeddings_per_partition - 1, dtype=mstype.int32) + self.expand_dims = ops.ExpandDims() + self.gather = ops.Gather() + self.sub = ops.Sub() + self.relu = ops.ReLU() + self.minimum = ops.Minimum() + self.eq = ops.Equal() + self.mul = ops.Mul() + + def construct(self, x): + displaced_x = self.sub(x, self.vocab_start_index) + down_truncated_x = self.relu(displaced_x) + truncated_x = self.minimum(down_truncated_x, self.max_index_per_partition) + input_mask = self.eq(displaced_x, truncated_x) + input_mask = self.expand_dims(input_mask, -1) + output_parallel = self.gather(self.embedding_weight, truncated_x, 0) + output_parallel = self.mul(output_parallel, input_mask) + output = self.all_reduce(output_parallel) + return output + ``` + + Inference of parallel embedding: + + ```python + input_ids = np.random.randint(0, config.vocab_size, size=(config.batch_size, config.seq_length), dtype=np.int32) + input_ids = Tensor(input_ids) + + vocab_parallel_embedding = VocabParallelEmbedding(num_embeddings=config.vocab_size, + embedding_dim=config.hidden_size) + embedding_output = vocab_parallel_embedding(input_ids) + print(embedding_output.shape) + ``` + +### TransformerModel Parallel Adaptation + +It can be seen that the tensor is processed sequentially. First, it passes through the `ColumnParallelLinear` column-wise MatMul to obtain the parallelized results. Then, it is input to the `RowParallelLinear` row-wise MatMul, resulting in the complete output of the two MatMul operations. + +![Column+Row](../../../source_zh_cn/model_infer/ms_infer/images/column+row.png) + +Based on the preceding analysis, TransformerModel can be modified to support parallelism. + +1. Attention + + Take the multi-head attention (MHA) module as an example. The attention module in the Transformer is multi-headed, and attention heads are independent of each other. Therefore, the activation value can be sharded by `hidden_size` while ensuring that a single attention head is complete. For example, assume that the number of MHA headers (`num_heads`) is 16, the number of dimensions (`head_dim`) of each header is 256, then `hidden_size` is 4096, and the number of linear in/out dimensions of Q/K/V is 4096. When `tensor_model_parallel` is set to `4` for the model parallelism, these linear results are allocated to four devices. The shape of each device is (4096,1024), indicating that each device computes 4 heads. 
+ + ![MHA](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/tutorials/source_zh_cn/model_infer/ms_infer/images/MHA.png) + + The following is an example of the Attention module code: + + ```python + class ParallelAttention(nn.Cell): + def __init__(self, config): + super().__init__() + self.tensor_model_parallel_size = COMMUN_HELPER.get_tensor_model_parallel_group_size() + self.num_heads_per_partition = config.num_heads // self.tensor_model_parallel_size + self.head_dim = config.hidden_size // config.num_heads + self.norm_factor = math.sqrt(self.head_dim) + self.q = ColumnParallelLinear(in_channels=config.hidden_size, + out_channels=config.hidden_size, + weight_init='normal', + has_bias=config.has_bias) + self.k = ColumnParallelLinear(in_channels=config.hidden_size, + out_channels=config.hidden_size, + weight_init='normal', + dtype=config.dtype, + has_bias=config.has_bias) + self.v = ColumnParallelLinear(in_channels=config.hidden_size, + out_channels=config.hidden_size, + weight_init='normal', + dtype=config.dtype, + has_bias=config.has_bias) + self.flash_attention = ops.operations.nn_ops.FlashAttentionScore(head_num=self.num_heads_per_partition, + scale_value=1.0/self.norm_factor, + next_tokens=0) + self.out = RowParallelLinear(in_channels=config.hidden_size, + out_channels=config.hidden_size, + weight_init='normal', + dtype=config.dtype, + has_bias=config.has_bias) + + def construct(self, x, mask): + query = self.q(x) + key = self.k(x) + value = self.v(x) + _, _, _, context_layer = self.flash_attention(query, key, value, attn_mask=mask) + output = self.out(context_layer) + return output + ``` + +2. MLP + + The MLP module is two fully-connected layers, which can also be processed by parallel MatMul. The code is as follows: + + ```python + class ParallelMLP(nn.Cell): + def __init__(self, config): + super().__init__() + self.w1 = ColumnParallelLinear(in_channels=config.hidden_size, + out_channels=config.ffn_hidden_size, + weight_init='normal', + dtype=config.dtype, + has_bias=config.has_bias) + self.w2 = RowParallelLinear(in_channels=config.ffn_hidden_size, + out_channels=config.hidden_size, + weight_init='normal', + dtype=config.dtype, + has_bias=config.has_bias) + self.act_func = nn.SiLU() + self.mul = ops.Mul() + + def construct(self, x): + x = self.w1(x) + x = self.act_func(x) + output = self.w2(x) + return output + ``` + +3. TransformerLayer + + TransformerLayer consists of Attention and MLP. Since there are no single operators that can be parallelized, you only need to pass the parallel parameters to Attention and MLP. + + ```python + class ParallelTransformerLayer(nn.Cell): + def __init__(self, config): + super().__init__() + self.attention = ParallelAttention(config=config) + self.feed_forward = ParallelMLP(config=config) + self.attention_norm = RMSNorm(dim=config.hidden_size, dtype=config.dtype) + self.ffn_norm = RMSNorm(dim=config.hidden_size, dtype=config.dtype) + self.add = ops.Add() + + def construct(self, x, mask): + norm_output = self.attention_norm(x) + attention_output = self.attention(norm_output, mask) + norm_input = self.add(x, attention_output) + norm_output = self.ffn_norm(norm_input) + mlp_output = self.feed_forward(norm_output) + output = self.add(norm_input, mlp_output) + return output + ``` + +4. 
TransformerModel
+
+    ```python
+    class ParallelTransformer(nn.Cell):
+        def __init__(self, config):
+            super().__init__()
+            self.embedding = VocabParallelEmbedding(num_embeddings=config.vocab_size,
+                                                    embedding_dim=config.hidden_size,
+                                                    init_method='normal',
+                                                    init_type=config.dtype)
+            self.layers = nn.CellList()
+            self.num_layers = config.num_layers
+            for _ in range(config.num_layers):
+                layer = ParallelTransformerLayer(config=config)
+                self.layers.append(layer)
+            self.norm_out = RMSNorm(dim=config.hidden_size, dtype=config.dtype)
+
+        def construct(self, x, mask):
+            hidden_state = self.embedding(x)
+            for i in range(self.num_layers):
+                hidden_state = self.layers[i](hidden_state, mask)
+            hidden_state = self.norm_out(hidden_state)
+            return hidden_state
+    ```
+
+For details about the end-to-end LLM code project, see the [model_dev.py](https://gitee.com/mindspore/docs/blob/master/docs/sample_code/infer_code/model_dev.py) script. Run the following command to verify the code:
+
+```shell
+msrun --worker_num 2 --local_worker_num 2 --master_port 8124 --log_dir msrun_log --join True --cluster_time_out 300 model_dev.py
+```
+
+## Practice: Qwen2 Model Parallel Reconstruction
+
+This section describes how to adapt the Qwen2 LLM developed in [Building an LLM Inference Network from Scratch](./ms_infer_network_develop.md) to parallel processing. Based on the preceding analysis, parallel adaptation can be divided into the following two main steps:
+
+1. **Model network adaptation**: Based on the preceding parallelism solution, parallelize the network layers in the model and allocate the computation workloads to multiple cards.
+
+2. **Model weight adaptation**: Modify the weights accordingly when the model weights are loaded because the shape of the weights in Linear changes after parallel sharding.
+
+To simplify the scenario, this section shards only the Linear layer of the Qwen2 model with a parallelism degree of 2. Currently, the sharding of the embedding layer is not involved.
+
+### Establishing a Communication Group
+
+Before reconstructing the model, you need to use the communication module of MindSpore to establish a communication group for the subsequent communication operations. This can be implemented directly with the CommunicationHelper class described above, as shown in the following code:
+
+```python
+from mindspore.communication import create_group, get_group_size, get_rank, init
+
+class CommunicationHelper:
+    def __init__(self, group_name: str, size: int) -> None:
+        self.group_name = group_name
+        self.size = size
+        self.rank_list = [i for i in range(size)]
+
+    def create_tensor_model_parallel_group(self):
+        create_group(group=self.group_name, rank_ids=self.rank_list)
+
+    def get_tensor_model_parallel_group_size(self):
+        return get_group_size(group=self.group_name)
+
+    def get_tensor_model_parallel_group_rank(self):
+        return get_rank(group=self.group_name)
+
+    def get_tensor_model_parallel_group(self):
+        return self.group_name
+
+COMMON_HELPER = None
+
+def init_communication():
+    TP_GROUP_NAME = "tp"
+    TP_SIZE = 2
+
+    global COMMON_HELPER
+    COMMON_HELPER = CommunicationHelper(group_name=TP_GROUP_NAME, size=TP_SIZE)
+    init()
+    COMMON_HELPER.create_tensor_model_parallel_group()
+```
+
+### Model Sharding and Parallelism
+
+This solution mainly performs sharding and parallelism on the Linear layer. Therefore, the changes mainly involve the Linear layer.
In the implementation, Qwen2Linear needs to be changed to Qwen2ColParallelLinear and Qwen2RowParallelLinear, which correspond to the Linear layer of column sharding and row sharding, respectively. For details, see the following code: + +```diff +from typing import Optional, Type, Tuple + +from mindspore import nn, ops, mint, Parameter, Tensor + +class Qwen2ColParallelLinear(nn.Cell): + def __init__(self, input_size: int, output_size: int, param_dtype: Optional[Type], bias: bool) -> None: + super().__init__() + ++ self.tp_size = COMMON_HELPER.get_tensor_model_parallel_group_size() + self.param_dtype = param_dtype + self.input_size = input_size +- self.output_size = output_size ++ self.output_size = output_size // self.tp_size + self.enable_bias = bias + + self.matmul = ops.MatMul(transpose_b=True) + self.weight = Parameter( + mint.zeros( + (self.output_size, self.input_size), + dtype=self.param_dtype + ), requires_grad=False + ) + + if self.enable_bias: + self.bias_add = ops.Add() + self.bias = Parameter( + mint.zeros(self.output_size, dtype=self.param_dtype) + ) + + def construct(self, input: Tensor) -> Tuple[Tensor, bool]: + origin_shape = input.shape + x = self.matmul(input.view(-1, origin_shape[-1]), self.weight) + if self.enable_bias: + x = self.bias_add(x, self.bias) + return x.view(*origin_shape[:-1], -1) + + +class Qwen2RowParallelLinear(nn.Cell): + def __init__(self, input_size: int, output_size: int, param_dtype: Optional[Type], bias: bool) -> None: + super().__init__() + ++ self.tp_size = COMMON_HELPER.get_tensor_model_parallel_group_size() + self.param_dtype = param_dtype +- self.input_size = input_size ++ self.input_size = input_size // self.tp_size + self.output_size = output_size + self.enable_bias = bias + + self.matmul = ops.MatMul(transpose_b=True) + self.weight = Parameter( + mint.zeros( + (self.output_size, self.input_size), + dtype=self.param_dtype + ), requires_grad=False + ) + + if self.enable_bias: + self.bias_add = ops.Add() + self.bias = Parameter( + mint.zeros(self.output_size, dtype=self.param_dtype) + ) ++ self.all_reduce = ops.AllReduce(group=COMMON_HELPER.get_tensor_model_parallel_group()) + + def construct(self, input: Tensor) -> Tuple[Tensor, bool]: + origin_shape = input.shape + x = self.matmul(input.view(-1, origin_shape[-1]), self.weight) + if self.enable_bias: + x = self.bias_add(x, self.bias) ++ x = self.all_reduce(x) + return x.view(*origin_shape[:-1], -1) +``` + +As shown in the preceding code, the Linear reconstruction is simple. Qwen2ColParallelLinear only needs to shard the output dimension based on the parallelism degree, and Qwen2RowParallelLinear only needs to shard the input dimension based on the parallelism degree. Because all_reduce computation is required after row sharding, an all_reduce operation is added to Qwen2RowParallelLinear. + +In addition, the original Qwen2Linear layer needs to be changed to a new Linear layer based on the algorithm. Pay attention to the following three parts: + +- **Attention**: Four Linear layers are involved, including query, key, value, and output. The query, key, and value layers need to be replaced by Qwen2ColParallelLinear, and the output layer needs to be replaced by Qwen2RowParallelLinear. + +- **MLP**: Three Linear layers are involved, including gate, up, and down. The gate and up layers need to be replaced by Qwen2ColParallelLinear, and the down layer needs to be replaced by Qwen2RowParallelLinear. + +- **LMHead**: A Linear layer is involved. 
Since it is not followed by a row-wise Linear layer, the all_gather operation is required to collect the results from all devices.
+
+You can complete these modifications and adaptations by replacing the corresponding class objects. The modified network layer implementation is as follows:
+
+```diff
+import numpy as np
+from typing import Optional, Type
+
+from mindspore import nn, ops, mint, Parameter, Tensor
+
+
+class Qwen2Attention(nn.Cell):
+    def __init__(self, config: Qwen2Config) -> None:
+        super().__init__()
+
++       self.tp_size = COMMON_HELPER.get_tensor_model_parallel_group_size()
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.num_kv_heads = config.num_key_value_heads
+        self.head_dim = config.hidden_size // self.num_heads
+        self.q_size = self.head_dim * self.num_heads
+        self.kv_size = self.head_dim * self.num_kv_heads
+        self.scaling = float(self.head_dim ** -0.5)
+        self.rope_theta = int(config.rope_theta)
+        self.param_dtype = config.param_dtype
+        self.max_position = config.max_position_embeddings
+
+-       self.flash_attn = FlashAttention(self.scaling, self.num_heads)
+-       self.paged_attn = PagedAttention(self.num_heads, self.scaling, self.num_kv_heads)
++       self.flash_attn = FlashAttention(self.scaling, self.num_heads // self.tp_size)
++       self.paged_attn = PagedAttention(self.num_heads // self.tp_size, self.scaling, self.num_kv_heads // self.tp_size)
+        self.reshape_and_cache = ops.auto_generate.ReshapeAndCache()
+
+        self.q_proj = Qwen2ColParallelLinear(
+            input_size=self.hidden_size,
+            output_size=self.q_size,
+            param_dtype=self.param_dtype,
+            bias=True
+        )
+        self.k_proj = Qwen2ColParallelLinear(
+            input_size=self.hidden_size,
+            output_size=self.kv_size,
+            param_dtype=self.param_dtype,
+            bias=True
+        )
+        self.v_proj = Qwen2ColParallelLinear(
+            input_size=self.hidden_size,
+            output_size=self.kv_size,
+            param_dtype=self.param_dtype,
+            bias=True
+        )
+        self.o_proj = Qwen2RowParallelLinear(
+            input_size=self.q_size,
+            output_size=self.hidden_size,
+            param_dtype=self.param_dtype,
+            bias=False
+        )
+
+        self.rotary_emb = Qwen2RotaryEmbedding(
+            head_size=self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position_embeddings=self.max_position,
+            base=self.rope_theta,
+            dtype=self.param_dtype
+        )
+
+    def construct(self, hidden_state: Tensor, positions: Tensor, batch_valid_length: Tensor,
+                  is_prefill: bool, layer_idx: int, k_cache: Tensor, v_cache: Tensor,
+                  slot_mapping: Tensor, block_tables: Tensor, attn_mask: Tensor,
+                  q_seq_lens: Tensor) -> Tensor:
+        bs, seq_len, hidden_dim = hidden_state.shape
+
+-       q = self.q_proj(hidden_state).view(-1, self.q_size)
+-       k = self.k_proj(hidden_state).view(-1, self.kv_size)
+-       v = self.v_proj(hidden_state).view(-1, self.kv_size)
++       q = self.q_proj(hidden_state).view(-1, self.q_size // self.tp_size)
++       k = self.k_proj(hidden_state).view(-1, self.kv_size // self.tp_size)
++       v = self.v_proj(hidden_state).view(-1, self.kv_size // self.tp_size)
+
+        k = k.contiguous()
+        v = v.contiguous()
+
+        cache_out = self.reshape_and_cache(
+            k,
+            v,
+            k_cache,
+            v_cache,
+            slot_mapping
+        )
+        q = ops.depend(q, cache_out)
+
+        if is_prefill:
+            attn_output = self.flash_attn(
+                q,
+                k,
+                v,
+                attn_mask,
+                batch_valid_length
+            )
+        else:
+            attn_output = self.paged_attn(
+                q,
+                k_cache,
+                v_cache,
+                block_tables,
+                batch_valid_length,
+                attn_mask,
+                q_seq_lens
+            )
+
+        output = self.o_proj(attn_output).view(bs, seq_len, -1)
+        return output
+
+class Qwen2MLP(nn.Cell):
+    def __init__(self, config: Qwen2Config) -> None:
+        super().__init__()
+
+        self.up_proj = Qwen2ColParallelLinear(
+            input_size=config.hidden_size,
+            output_size=config.intermediate_size,
+            param_dtype=config.param_dtype,
+            bias=False
+        )
+        self.gate_proj = Qwen2ColParallelLinear(
+            input_size=config.hidden_size,
+            output_size=config.intermediate_size,
+            param_dtype=config.param_dtype,
+            bias=False
+        )
+        self.down_proj = Qwen2RowParallelLinear(
+            input_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            param_dtype=config.param_dtype,
+            bias=False
+        )
+        self.act_fn = ops.silu
+
+    def construct(self, x: Tensor) -> Tensor:
+        output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return output
+
++class GatherLastDim(nn.Cell):
++    def __init__(self):
++        super().__init__()
++        self.all_gather = ops.AllGather(group=COMMON_HELPER.get_tensor_model_parallel_group())
++        self.world_size = COMMON_HELPER.get_tensor_model_parallel_group_size()
++        self.split = ops.Split(axis=0, output_num=self.world_size)
++
++    def construct(self, input: Tensor) -> Tensor:
++        output = self.all_gather(input)
++        tensor_list = self.split(output)
++        output = ops.cat(tensor_list, axis=-1)
++        return output
+
+class Qwen2ForCausalLM(nn.Cell):
+    def __init__(self, config: Qwen2Config) -> None:
+        super().__init__()
+
+        self.model = Qwen2Model(config=config)
+        self.lm_head = Qwen2ColParallelLinear(
+            input_size=config.hidden_size,
+            output_size=config.vocab_size,
+            param_dtype=config.param_dtype,
+            bias=False
+        )
++       self.all_gather = GatherLastDim()
+
+    def load_weight(self, weight_path: str) -> None:
+        weight_dict = {}
+        for path in glob(weight_path + "/*.safetensors"):
+            weight_dict.update(ms.load_checkpoint(path, format="safetensors"))
+
+        ms.load_param_into_net(self, weight_dict, strict_load=False)
+
+    def construct(self, model_input: Qwen2ModelInput) -> Tensor:
+        hidden_state = self.model(model_input.input_ids, model_input.positions,
+                                  model_input.batch_valid_length, model_input.is_prefill,
+                                  model_input.k_caches, model_input.v_caches, model_input.slot_mapping,
+                                  model_input.block_tables, model_input.attn_mask, model_input.q_seq_len)
+        logits = self.lm_head(hidden_state)[:, -1]
++       logits = self.all_gather(logits)
+        return logits
+```
+
+The code changes are small. Note that the query, key, and value in attention are sharded by attention head, so the head counts passed to FlashAttention and PagedAttention must be divided by the parallelism degree to narrow the computation scope. In addition, make sure that the numbers of query, key, and value heads are exactly divisible by the parallelism degree.
+
+### Model Weight Sharding
+
+The original Qwen2ForCausalLM injects weights into the model with the load_param_into_net function provided by MindSpore, which simply loads the full original weights. After the model is sharded, weight loading must be adapted as well: the weight shapes change, and processes on non-zero ranks need to read their data at the corresponding offsets. Therefore, the load_weight function needs to be modified to load weights in parallel mode.
+
+It is recommended to register a loading function on each weight parameter so that every network layer decides how to slice the full weight.
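+
+Conceptually, each rank just cuts its own shard out of the full checkpoint tensor at a rank-dependent offset. The following NumPy sketch (shapes and names are purely illustrative) shows the offset arithmetic that the narrow calls in the code below perform:
+
+```python
+import numpy as np
+
+tp_size, tp_rank = 2, 1                          # hypothetical: the second of two ranks
+full_weight = np.arange(8 * 4).reshape(8, 4)     # stored as (output_size, input_size)
+
+# Column-parallel layer: the output dimension (dim 0 of the stored weight) is sharded.
+shard = full_weight.shape[0] // tp_size
+col_shard = full_weight[tp_rank * shard:(tp_rank + 1) * shard, :]
+
+# Row-parallel layer: the input dimension (dim 1 of the stored weight) is sharded.
+shard = full_weight.shape[1] // tp_size
+row_shard = full_weight[:, tp_rank * shard:(tp_rank + 1) * shard]
+
+print(col_shard.shape, row_shard.shape)          # (4, 4) and (8, 2)
+```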
For details, see the following code: + +```diff +from typing import Optional, Type, Tuple + +from mindspore import nn, ops, mint, Parameter, Tensor + +class Qwen2ColParallelLinear(nn.Cell): + def __init__(self, input_size: int, output_size: int, param_dtype: Optional[Type], bias: bool) -> None: + super().__init__() + + self.tp_size = COMMON_HELPER.get_tensor_model_parallel_group_size() + self.param_dtype = param_dtype + self.input_size = input_size + self.output_size = output_size // self.tp_size + self.enable_bias = bias + + self.matmul = ops.MatMul(transpose_b=True) + self.weight = Parameter( + mint.zeros( + (self.output_size, self.input_size), + dtype=self.param_dtype + ), requires_grad=False + ) ++ setattr(self.weight, "weight_load", self.weight_load) + + if self.enable_bias: + self.bias_add = ops.Add() + self.bias = Parameter( + mint.zeros(self.output_size, dtype=self.param_dtype) + ) ++ setattr(self.bias, "weight_load", self.weight_load) + + def construct(self, input: Tensor) -> Tuple[Tensor, bool]: + origin_shape = input.shape + x = self.matmul(input.view(-1, origin_shape[-1]), self.weight) + if self.enable_bias: + x = self.bias_add(x, self.bias) + return x.view(*origin_shape[:-1], -1) + ++ def weight_load(self, param: Tensor, weight: Tensor) -> None: ++ tp_rank = COMMON_HELPER.get_tensor_model_parallel_group_rank() ++ copy_dim = 0 ++ shard_size = param.shape[copy_dim] ++ start_idx = tp_rank * shard_size ++ weight = weight.narrow(copy_dim, start_idx, shard_size).contiguous() ++ ++ param.set_data(weight) ++ return None + + + +class Qwen2RowParallelLinear(nn.Cell): + def __init__(self, input_size: int, output_size: int, param_dtype: Optional[Type], bias: bool) -> None: + super().__init__() + + self.tp_size = COMMON_HELPER.get_tensor_model_parallel_group_size() + self.param_dtype = param_dtype + self.input_size = input_size // self.tp_size + self.output_size = output_size + self.enable_bias = bias + + self.matmul = ops.MatMul(transpose_b=True) + self.weight = Parameter( + mint.zeros( + (self.output_size, self.input_size), + dtype=self.param_dtype + ), requires_grad=False + ) ++ setattr(self.weight, "weight_load", self.weight_load) + + if self.enable_bias: + self.bias_add = ops.Add() + self.bias = Parameter( + mint.zeros(self.output_size, dtype=self.param_dtype) + ) ++ setattr(self.bias, "weight_load", self.weight_load) + self.all_reduce = ops.AllReduce(group=COMMON_HELPER.get_tensor_model_parallel_group()) + + def construct(self, input: Tensor) -> Tuple[Tensor, bool]: + origin_shape = input.shape + x = self.matmul(input.view(-1, origin_shape[-1]), self.weight) + if self.enable_bias: + x = self.bias_add(x, self.bias) + x = self.all_reduce(x) + return x.view(*origin_shape[:-1], -1) + ++ def weight_load(self, param: Tensor, weight: Tensor) -> None: ++ tp_rank = COMMON_HELPER.get_tensor_model_parallel_group_rank() ++ copy_dim = 1 ++ shard_size = param.shape[copy_dim] ++ start_idx = tp_rank * shard_size ++ weight = weight.narrow(copy_dim, start_idx, shard_size).contiguous() ++ ++ param.set_data(weight) ++ return None + +class Qwen2ForCausalLM(nn.Cell): + def __init__(self, config: Qwen2Config) -> None: + super().__init__() + + self.model = Qwen2Model(config=config) + self.lm_head = Qwen2ColParallelLinear( + input_size=config.hidden_size, + output_size=config.vocab_size, + param_dtype=config.param_dtype, + bias=False + ) + self.all_gather = GatherLastDim() + + def load_weight(self, weight_path: str) -> None: + weight_dict = {} + for path in glob(weight_path + "/*.safetensors"): + 
weight_dict.update(ms.load_checkpoint(path, format="safetensors"))
+
+-        ms.load_param_into_net(self, weight_dict, strict_load=False)
++        param_dict = self.parameters_dict()
++
++        for (name, weight) in weight_dict.items():
++            if name in param_dict:
++                param = param_dict[name]
++                if hasattr(param, "weight_load"):
++                    weight_load = getattr(param, "weight_load")
++                    weight_load(param, weight)
++                else:
++                    param.set_data(weight)
+```
+
+A weight_load method is added to each network layer that needs user-defined weight loading, and it is attached to the weight object with setattr. When the model weights are loaded, the matching Parameter object is looked up in the parameter dictionary and its weight_load method, if present, is called to update it. Both the column-wise and the row-wise Linear layers use the narrow method of Tensor to extract the slice at the rank-specific offset; the only difference between them is the dimension that is sharded.
+
+### Parallel Execution
+
+After the model adaptation and weight adaptation are complete, you can run the following command to start multi-device execution:
+
+```shell
+msrun --worker_num 2 --local_worker_num 2 --master_port 8124 --log_dir msrun_log --join True --cluster_time_out 300 infer_parallel.py
+```
+
+**infer_parallel.py** is the parallel inference script.
diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_quantization.md b/tutorials/source_en/model_infer/ms_infer/ms_infer_quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..5d20e83bcf527e264f1e56ea896c192f40e74057
--- /dev/null
+++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_quantization.md
@@ -0,0 +1,204 @@
+# Model Quantization
+
+[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/tutorials/source_en/model_infer/ms_infer/ms_infer_quantization.md)
+
+## Overview
+
+MindSpore is an all-scenario AI framework. When a model is deployed on device-side or other lightweight hardware, it may be constrained by memory, power consumption, and latency, so the model needs to be compressed before deployment.
+
+[MindSpore Golden Stick](https://www.mindspore.cn/golden_stick/docs/en/master/index.html) provides the model compression capability of MindSpore. It is a set of model compression algorithms jointly designed and developed by Huawei's Noah's Ark team and the MindSpore team, and it supports quantization modes such as A16W8, A16W4, A8W8, and KVCache quantization. For details, see [MindSpore Golden Stick](https://www.mindspore.cn/golden_stick/docs/en/master/index.html).
+
+## Basic Model Quantization Process
+
+To help you understand the basic model quantization process of MindSpore Golden Stick, this section uses the quantization algorithm as an example to describe the basic usage.
+
+### Procedure
+
+The MindSpore Golden Stick quantization workflow can be divided into two phases: a quantization phase and a deployment phase. The quantization phase is completed before deployment; its main tasks are collecting the weight distribution, computing the quantization parameters, quantizing the weight data, and inserting dequantization nodes. The deployment phase is the process of running inference on the quantized model with the MindSpore framework in the production environment.
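+
+As a rough mental model (not the exact MindSpore Golden Stick implementation), int8 weight quantization in an A16W8 scheme computes a scale from the weight statistics, stores the weight as int8, and dequantizes it back to floating point during inference. The following standalone NumPy sketch illustrates the idea:
+
+```python
+import numpy as np
+
+np.random.seed(0)
+w = np.random.randn(4, 8).astype(np.float32)      # a float weight matrix
+
+# Per-channel symmetric int8 quantization: one scale per output channel.
+scale = np.abs(w).max(axis=1, keepdims=True) / 127.0
+w_int8 = np.clip(np.round(w / scale), -128, 127).astype(np.int8)
+
+# At inference time the int8 weight is dequantized (or the scale is folded into the MatMul).
+w_dequant = w_int8.astype(np.float32) * scale
+
+print("max abs error:", np.abs(w - w_dequant).max())
+```
+
+Storing the weight as int8 halves the weight memory compared with float16, which is the main benefit for deployment on memory-constrained devices.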
+ +MindSpore Golden Stick mainly uses `PTQConfig` to customize quantization and deployment, and uses the `apply` and `convert` APIs to implement quantization and deployment. You can configure whether to quantize the weight, activation, and KVCache, and configure the quantization bit in `PTQConfig`. In addition, you can configure the data calibration policy. For details, see [PTQConfig Description](#ptqconfig-description). + +The quantization procedure of MindSpore Golden Stick is as follows: + +```python +import numpy as np +import mindspore as ms +import mindspore.nn as nn +from mindspore import Tensor, dtype +from mindformers.modules import Linear +from mindspore_gs.common import BackendTarget +from mindspore_gs.ptq import PTQMode, PTQConfig +from mindspore_gs.ptq.ptq import PTQ +from mindspore.dataset import GeneratorDataset + +class SimpleNet(nn.Cell): + class DecoderCell(nn.Cell): + def __init__(self, linear): + super().__init__() + self.linear = linear + + def construct(self, *args, **kwargs): + return self.linear(*args, **kwargs) + + def __init__(self, foo_seq_length=1024): + super().__init__() + + self.foo_seq_length = foo_seq_length + linear = Linear(in_channels=foo_seq_length, out_channels=foo_seq_length, weight_init="ones") + self.decoder = SimpleNet.DecoderCell(linear) + + def construct(self, x): + return self.decoder(x) + + def generate(self, input_ids, do_sample=False, max_new_tokens=1): + input_ids = np.pad(input_ids, ((0, 0), (0, self.foo_seq_length - input_ids.shape[1])), 'constant', + constant_values=0) + return self.construct(Tensor(input_ids, dtype=dtype.float16)) + +def create_foo_ds(repeat=1): + class SimpleIterable: + def __init__(self, repeat=1): + self._index = 0 + self.data = [] + for _ in range(repeat): + self.data.append(np.array([[1, 1, 1]], dtype=np.int32)) + + def __next__(self): + if self._index >= len(self.data): + raise StopIteration + item = (self.data[self._index],) + self._index += 1 + return item + + def __iter__(self): + self._index = 0 + return self + + def __len__(self): + return len(self.data) + + return GeneratorDataset(source=SimpleIterable(repeat), column_names=["input_ids"]) + + +net = SimpleNet() # The float model that needs to be quantized +ds = create_foo_ds(1) +cfg = PTQConfig(mode=PTQMode.QUANTIZE, backend=BackendTarget.ASCEND, weight_quant_dtype=dtype.int8) +ptq = PTQ(cfg) +ptq.apply(net, datasets=ds) +ptq.convert(net) + +ms.save_checkpoint(net.parameters_dict(), './simplenet_ptq.ckpt') +``` + +1. Use [nn.Cell](https://www.mindspore.cn/docs/en/master/api_python/nn/mindspore.nn.Cell.html) to define the network. After the model is trained, the floating-point weights of the model are obtained. During inference, the floating-point weights of the model are loaded. The preceding example simplifies the process by directly creating a network and quantizing the network using the initial floating-point weights. +2. Use PTQConfig to set the mode to quantization and backend to Ascend for 8-bit quantization of the weights. For details, see [PTQConfig Description](#ptqconfig-description). +3. Use the apply API to convert the network into a fake-quantized network and collect statistics on the quantization objects according to `PTQConfig`. +4. Use the convert API to perform real quantization on the fake-quantized network obtained in the previous step to obtain the quantized network. + +After the quantization is complete, you can use the quantized model for inference. 
The procedure is as follows: + +```python +import numpy as np +import mindspore as ms +import mindspore.nn as nn +from mindspore import Tensor, dtype +from mindformers.modules import Linear +from mindspore_gs.common import BackendTarget +from mindspore_gs.ptq import PTQMode, PTQConfig +from mindspore_gs.ptq.ptq import PTQ +from mindspore.dataset import GeneratorDataset + +class SimpleNet(nn.Cell): + class DecoderCell(nn.Cell): + def __init__(self, linear): + super().__init__() + self.linear = linear + + def construct(self, *args, **kwargs): + return self.linear(*args, **kwargs) + + def __init__(self, foo_seq_length=1024): + super().__init__() + + self.foo_seq_length = foo_seq_length + linear = Linear(in_channels=foo_seq_length, out_channels=foo_seq_length, weight_init="ones") + self.decoder = SimpleNet.DecoderCell(linear) + + def construct(self, x): + return self.decoder(x) + + def generate(self, input_ids, do_sample=False, max_new_tokens=1): + input_ids = np.pad(input_ids, ((0, 0), (0, self.foo_seq_length - input_ids.shape[1])), 'constant', + constant_values=0) + return self.construct(Tensor(input_ids, dtype=dtype.float16)) + +net = SimpleNet() +cfg = PTQConfig(mode=PTQMode.DEPLOY, backend=BackendTarget.ASCEND, weight_quant_dtype=dtype.int8) +ptq = PTQ(cfg) +ptq.apply(net) +ptq.convert(net) +ms.load_checkpoint('./simplenet_ptq.ckpt', net) + +input = Tensor(np.ones((5, 1024), dtype=np.float32), dtype=dtype.float32) +output = net(input) +print(output) +``` + +1. Use PTQConfig to set the mode to deployment and backend to Ascend for 8-bit quantization of the weights. For details, see [PTQConfig Description](#ptqconfig-description). +2. Use the apply and convert APIs to convert the network into a quantized network. In the deployment phase, no information statistics are collected or quantization computing is performed. Only the network structure is converted into a quantized network. +3. Load the quantized weights to the quantized network for inference. + +### PTQConfig Description + +You can customize the PTQConfig to enable different quantization capabilities. For details about PTQConfig, see the [API document](https://www.mindspore.cn/golden_stick/docs/en/master/ptq/mindspore_gs.ptq.PTQConfig.html#mindspore_gs.ptq.PTQConfig). The following lists the configuration examples of these algorithms: + +> **A** indicates activation, **W** indicates weight, **C** indicates KVCache, and the number indicates the bit. For example, A16W8 indicates that the activation is quantized to float16 and the weight is quantized to int8. + +- A16W8 weight quantization + + ```python + from mindspore import dtype as msdtype + from mindspore_gs.ptq import PTQConfig, OutliersSuppressionType + + ptq_config = PTQConfig(weight_quant_dtype=msdtype.int8, act_quant_dtype=None, kvcache_quant_dtype=None, + outliers_suppression=OutliersSuppressionType.NONE) + ``` + +- A8W8 quantization + + > A8W8 quantization is based on the [SmoothQuant](https://gitcode.com/gh_mirrors/smo/smoothquant/overview) algorithm. PTQConfig provides the **outliers_suppression** field to specify whether to perform the smooth operation. 
+
+    ```python
+    from mindspore import dtype as msdtype
+    from mindspore_gs.ptq import PTQConfig, OutliersSuppressionType
+
+    ptq_config = PTQConfig(weight_quant_dtype=msdtype.int8, act_quant_dtype=msdtype.int8, kvcache_quant_dtype=None,
+                           outliers_suppression=OutliersSuppressionType.SMOOTH)
+    ```
+
+- KVCache int8 quantization
+
+    ```python
+    from mindspore import dtype as msdtype
+    from mindspore_gs.ptq import PTQConfig, OutliersSuppressionType
+
+    ptq_config = PTQConfig(weight_quant_dtype=None, act_quant_dtype=None, kvcache_quant_dtype=msdtype.int8,
+                           outliers_suppression=OutliersSuppressionType.NONE)
+    ```
+
+## Examples
+
+### PTQ Examples
+
+The following shows the complete process of quantizing the Llama2 network with the post-training quantization (PTQ) algorithm and deploying the result:
+
+- [PTQ algorithm](https://www.mindspore.cn/golden_stick/docs/en/master/ptq/ptq.html): supports 8-bit weight quantization, 8-bit full quantization, and KVCacheInt8 quantization. SmoothQuant can be used to improve the quantization precision, and combinations of different quantization algorithms are supported to further improve quantization inference performance.
+
+### Quantization-Aware Training Examples
+
+- [SimQAT algorithm](https://www.mindspore.cn/golden_stick/docs/en/master/quantization/simulated_quantization.html): A basic quantization-aware training algorithm based on fake quantization.
+- [SLB quantization algorithm](https://www.mindspore.cn/golden_stick/docs/en/master/quantization/slb.html): A non-linear, low-bit quantization-aware training algorithm.
+
+### Pruning Examples
+
+- [SCOP pruning algorithm](https://www.mindspore.cn/golden_stick/docs/en/master/pruner/scop.html): A structured weight pruning algorithm.
diff --git a/tutorials/source_zh_cn/beginner/tensor.ipynb b/tutorials/source_zh_cn/beginner/tensor.ipynb
index af801d8418ba6e1cbc75e039a69f8abc72b0b49d..b881f00120f438c1e88d4e494413d58ea150107d 100644
--- a/tutorials/source_zh_cn/beginner/tensor.ipynb
+++ b/tutorials/source_zh_cn/beginner/tensor.ipynb
@@ -31,7 +31,6 @@
    "source": [
     "import numpy as np\n",
     "import mindspore\n",
-    "from mindspore import ops\n",
     "from mindspore import Tensor"
    ]
   },
@@ -63,7 +62,7 @@
    ],
    "source": [
     "data = [1, 0, 1, 0]\n",
-    "x_data = Tensor(data)\n",
+    "x_data = mindspore.tensor(data)\n",
     "print(x_data, x_data.shape, x_data.dtype)"
    ]
   },
@@ -78,21 +77,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "[1 0 1 0] (4,) Int64\n"
+     "[1 0 1 0] (4,)\n"
    ]
    }
   ],
   "source": [
    "np_array = np.array(data)\n",
-   "x_np = Tensor(np_array)\n",
-   "print(x_np, x_np.shape, x_np.dtype)"
+   "x_from_np = mindspore.tensor(np_array)\n",
+   "print(x_from_np, x_from_np.shape)"
   ]
  },
  {
@@ -123,8 +122,8 @@
    " [[1. 1.]\n",
    " [1. 
1.]]\n", "tensor2:\n", - " [[-0.00063482 -0.00916224]\n", - " [ 0.01324238 -0.0171206 ]]\n" + " [[-0.0107513 0.00407822]\n", + " [-0.00113699 0.00081491]]\n" ] } ], @@ -220,7 +219,7 @@ } ], "source": [ - "x = Tensor(np.array([[1, 2], [3, 4]]), mindspore.int32)\n", + "x = mindspore.tensor(np.array([[1, 2], [3, 4]]), mindspore.int32)\n", "\n", "print(\"x_shape:\", x.shape)\n", "print(\"x_dtype:\", x.dtype)\n", @@ -257,7 +256,7 @@ } ], "source": [ - "tensor = Tensor(np.array([[0, 1], [2, 3]]).astype(np.float32))\n", + "tensor = mindspore.tensor(np.array([[0, 1], [2, 3]]).astype(np.float32))\n", "\n", "print(\"First row: {}\".format(tensor[0]))\n", "print(\"value of bottom right corner: {}\".format(tensor[1, 1]))\n", @@ -295,8 +294,8 @@ } ], "source": [ - "x = Tensor(np.array([1, 2, 3]), mindspore.float32)\n", - "y = Tensor(np.array([4, 5, 6]), mindspore.float32)\n", + "x = mindspore.tensor(np.array([1, 2, 3]), mindspore.float32)\n", + "y = mindspore.tensor(np.array([4, 5, 6]), mindspore.float32)\n", "\n", "output_add = x + y\n", "output_sub = x - y\n", @@ -339,8 +338,8 @@ } ], "source": [ - "data1 = Tensor(np.array([[0, 1], [2, 3]]).astype(np.float32))\n", - "data2 = Tensor(np.array([[4, 5], [6, 7]]).astype(np.float32))\n", + "data1 = mindspore.tensor(np.array([[0, 1], [2, 3]]).astype(np.float32))\n", + "data2 = mindspore.tensor(np.array([[4, 5], [6, 7]]).astype(np.float32))\n", "output = ops.concat((data1, data2), axis=0)\n", "\n", "print(output)\n", @@ -351,7 +350,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "[stack](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.stack.html)则是从另一个维度上将两个张量合并起来。" + "[stack](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.stack.html) 则是从另一个维度上将两个张量合并起来。" ] }, { @@ -374,8 +373,8 @@ } ], "source": [ - "data1 = Tensor(np.array([[0, 1], [2, 3]]).astype(np.float32))\n", - "data2 = Tensor(np.array([[4, 5], [6, 7]]).astype(np.float32))\n", + "data1 = mindspore.tensor(np.array([[0, 1], [2, 3]]).astype(np.float32))\n", + "data2 = mindspore.tensor(np.array([[4, 5], [6, 7]]).astype(np.float32))\n", "output = ops.stack([data1, data2])\n", "\n", "print(output)\n", @@ -392,7 +391,7 @@ "\n", "### Tensor转换为NumPy\n", "\n", - "可以使用 [Tensor.asnumpy()](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/Tensor/mindspore.Tensor.asnumpy.html) 将Tensor变量转换为NumPy变量。" + "可以使用 [Tensor.asnumpy()](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/Tensor/mindspore.Tensor.asnumpy.html) 将MindSpore的Tensor转换为NumPy数据。" ] }, { @@ -410,7 +409,7 @@ } ], "source": [ - "t = Tensor([1., 1., 1., 1., 1.])\n", + "t = mindspore.tensor([1., 1., 1., 1., 1.])\n", "print(f\"t: {t}\", type(t))\n", "n = t.asnumpy()\n", "print(f\"n: {n}\", type(n))" @@ -422,7 +421,7 @@ "source": [ "### NumPy转换为Tensor\n", "\n", - "使用`Tensor()`将NumPy变量转换为Tensor变量。" + "使用 [Tensor.from_numpy()](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/Tensor/mindspore.Tensor.asnumpy.html) 将NumPy数据转换为MindSpore的Tensor。此方法不拷贝数据,共用数据存储地址,速度较快,但NumPy数据需为连续(用numpy.iscontiguous()判断)。" ] }, { @@ -454,13 +453,50 @@ "print(f\"n: {n}\", type(n))\n", "print(f\"t: {t}\", type(t))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "使用 [mindspore.tensor](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.tensor.html) 直接创建,此方法会对数据进行拷贝。" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "n = np.ones(5)\n", + "t = mindspore.tensor(n)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n: [2. 2. 2. 2. 2.] \n", + "t: [1. 1. 1. 1. 1.] \n" + ] + } + ], + "source": [ + "np.add(n, 1, out=n)\n", + "print(f\"n: {n}\", type(n))\n", + "print(f\"t: {t}\", type(t))" + ] } ], "metadata": { "kernelspec": { - "display_name": "MindSpore", + "display_name": "ms25-kernel", "language": "python", - "name": "mindspore" + "name": "ms25-kernel" }, "language_info": { "codemirror_mode": { @@ -472,7 +508,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.5" + "version": "3.9.21" } }, "nbformat": 4,