diff --git a/docs/mindformers/docs/Makefile b/docs/mindformers/docs/Makefile deleted file mode 100644 index 1eff8952707bdfa503c8d60c1e9a903053170ba2..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = source_zh_cn -BUILDDIR = build_zh_cn - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/mindformers/docs/_ext/customdocumenter.txt b/docs/mindformers/docs/_ext/customdocumenter.txt deleted file mode 100644 index 2d37ae41f6772a21da2a7dc5c7bff75128e68330..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/_ext/customdocumenter.txt +++ /dev/null @@ -1,245 +0,0 @@ -import re -import os -from sphinx.ext.autodoc import Documenter - - -class CustomDocumenter(Documenter): - - def document_members(self, all_members: bool = False) -> None: - """Generate reST for member documentation. - - If *all_members* is True, do all members, else those given by - *self.options.members*. - """ - # set current namespace for finding members - self.env.temp_data['autodoc:module'] = self.modname - if self.objpath: - self.env.temp_data['autodoc:class'] = self.objpath[0] - - want_all = all_members or self.options.inherited_members or \ - self.options.members is ALL - # find out which members are documentable - members_check_module, members = self.get_object_members(want_all) - - # **** 排除已写中文接口名 **** - file_path = os.path.join(self.env.app.srcdir, self.env.docname+'.rst') - exclude_re = re.compile(r'(.. py:class::|.. py:function::)\s+(.*?)(\(|\n)') - includerst_re = re.compile(r'.. 
include::\s+(.*?)\n') - with open(file_path, 'r', encoding='utf-8') as f: - content = f.read() - excluded_members = exclude_re.findall(content) - if excluded_members: - excluded_members = [i[1].split('.')[-1] for i in excluded_members] - rst_included = includerst_re.findall(content) - if rst_included: - for i in rst_included: - include_path = os.path.join(os.path.dirname(file_path), i) - if os.path.exists(include_path): - with open(include_path, 'r', encoding='utf8') as g: - content_ = g.read() - excluded_member_ = exclude_re.findall(content_) - if excluded_member_: - excluded_member_ = [j[1].split('.')[-1] for j in excluded_member_] - excluded_members.extend(excluded_member_) - - if excluded_members: - if self.options.exclude_members: - self.options.exclude_members |= set(excluded_members) - else: - self.options.exclude_members = excluded_members - - # remove members given by exclude-members - if self.options.exclude_members: - members = [ - (membername, member) for (membername, member) in members - if ( - self.options.exclude_members is ALL or - membername not in self.options.exclude_members - ) - ] - - # document non-skipped members - memberdocumenters = [] # type: List[Tuple[Documenter, bool]] - for (mname, member, isattr) in self.filter_members(members, want_all): - classes = [cls for cls in self.documenters.values() - if cls.can_document_member(member, mname, isattr, self)] - if not classes: - # don't know how to document this member - continue - # prefer the documenter with the highest priority - classes.sort(key=lambda cls: cls.priority) - # give explicitly separated module name, so that members - # of inner classes can be documented - full_mname = self.modname + '::' + \ - '.'.join(self.objpath + [mname]) - documenter = classes[-1](self.directive, full_mname, self.indent) - memberdocumenters.append((documenter, isattr)) - member_order = self.options.member_order or \ - self.env.config.autodoc_member_order - if member_order == 'groupwise': - # sort by group; relies on stable sort to keep items in the - # same group sorted alphabetically - memberdocumenters.sort(key=lambda e: e[0].member_order) - elif member_order == 'bysource' and self.analyzer: - # sort by source order, by virtue of the module analyzer - tagorder = self.analyzer.tagorder - - def keyfunc(entry: Tuple[Documenter, bool]) -> int: - fullname = entry[0].name.split('::')[1] - return tagorder.get(fullname, len(tagorder)) - memberdocumenters.sort(key=keyfunc) - - for documenter, isattr in memberdocumenters: - documenter.generate( - all_members=True, real_modname=self.real_modname, - check_module=members_check_module and not isattr) - - # reset current objects - self.env.temp_data['autodoc:module'] = None - self.env.temp_data['autodoc:class'] = None - - def generate(self, more_content: Any = None, real_modname: str = None, - check_module: bool = False, all_members: bool = False) -> None: - """Generate reST for the object given by *self.name*, and possibly for - its members. - - If *more_content* is given, include that content. If *real_modname* is - given, use that module name to find attribute docs. If *check_module* is - True, only generate if the object is defined in the module name it is - imported from. If *all_members* is True, document all members. 
- """ - if not self.parse_name(): - # need a module to import - logger.warning( - __('don\'t know which module to import for autodocumenting ' - '%r (try placing a "module" or "currentmodule" directive ' - 'in the document, or giving an explicit module name)') % - self.name, type='autodoc') - return - - # now, import the module and get object to document - if not self.import_object(): - return - - # If there is no real module defined, figure out which to use. - # The real module is used in the module analyzer to look up the module - # where the attribute documentation would actually be found in. - # This is used for situations where you have a module that collects the - # functions and classes of internal submodules. - self.real_modname = real_modname or self.get_real_modname() # type: str - - # try to also get a source code analyzer for attribute docs - try: - self.analyzer = ModuleAnalyzer.for_module(self.real_modname) - # parse right now, to get PycodeErrors on parsing (results will - # be cached anyway) - self.analyzer.find_attr_docs() - except PycodeError as err: - logger.debug('[autodoc] module analyzer failed: %s', err) - # no source file -- e.g. for builtin and C modules - self.analyzer = None - # at least add the module.__file__ as a dependency - if hasattr(self.module, '__file__') and self.module.__file__: - self.directive.filename_set.add(self.module.__file__) - else: - self.directive.filename_set.add(self.analyzer.srcname) - - # check __module__ of object (for members not given explicitly) - if check_module: - if not self.check_module(): - return - - # document members, if possible - self.document_members(all_members) - - -class ModuleDocumenter(CustomDocumenter): - """ - Specialized Documenter subclass for modules. - """ - objtype = 'module' - content_indent = '' - titles_allowed = True - - option_spec = { - 'members': members_option, 'undoc-members': bool_option, - 'noindex': bool_option, 'inherited-members': bool_option, - 'show-inheritance': bool_option, 'synopsis': identity, - 'platform': identity, 'deprecated': bool_option, - 'member-order': identity, 'exclude-members': members_set_option, - 'private-members': bool_option, 'special-members': members_option, - 'imported-members': bool_option, 'ignore-module-all': bool_option - } # type: Dict[str, Callable] - - def __init__(self, *args: Any) -> None: - super().__init__(*args) - merge_members_option(self.options) - - @classmethod - def can_document_member(cls, member: Any, membername: str, isattr: bool, parent: Any - ) -> bool: - # don't document submodules automatically - return False - - def resolve_name(self, modname: str, parents: Any, path: str, base: Any - ) -> Tuple[str, List[str]]: - if modname is not None: - logger.warning(__('"::" in automodule name doesn\'t make sense'), - type='autodoc') - return (path or '') + base, [] - - def parse_name(self) -> bool: - ret = super().parse_name() - if self.args or self.retann: - logger.warning(__('signature arguments or return annotation ' - 'given for automodule %s') % self.fullname, - type='autodoc') - return ret - - def add_directive_header(self, sig: str) -> None: - Documenter.add_directive_header(self, sig) - - sourcename = self.get_sourcename() - - # add some module-specific options - if self.options.synopsis: - self.add_line(' :synopsis: ' + self.options.synopsis, sourcename) - if self.options.platform: - self.add_line(' :platform: ' + self.options.platform, sourcename) - if self.options.deprecated: - self.add_line(' :deprecated:', sourcename) - - def 
get_object_members(self, want_all: bool) -> Tuple[bool, List[Tuple[str, object]]]: - if want_all: - if (self.options.ignore_module_all or not - hasattr(self.object, '__all__')): - # for implicit module members, check __module__ to avoid - # documenting imported objects - return True, get_module_members(self.object) - else: - memberlist = self.object.__all__ - # Sometimes __all__ is broken... - if not isinstance(memberlist, (list, tuple)) or not \ - all(isinstance(entry, str) for entry in memberlist): - logger.warning( - __('__all__ should be a list of strings, not %r ' - '(in module %s) -- ignoring __all__') % - (memberlist, self.fullname), - type='autodoc' - ) - # fall back to all members - return True, get_module_members(self.object) - else: - memberlist = self.options.members or [] - ret = [] - for mname in memberlist: - try: - ret.append((mname, safe_getattr(self.object, mname))) - except AttributeError: - logger.warning( - __('missing attribute mentioned in :members: or __all__: ' - 'module %s, attribute %s') % - (safe_getattr(self.object, '__name__', '???'), mname), - type='autodoc' - ) - return False, ret diff --git a/docs/mindformers/docs/_ext/myautosummary.py b/docs/mindformers/docs/_ext/myautosummary.py deleted file mode 100644 index 581230d590d394038ef5ef285405a06b5300c254..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/_ext/myautosummary.py +++ /dev/null @@ -1,536 +0,0 @@ -"""Customized autosummary directives for sphinx.""" -import os -import re -import inspect -import importlib -from typing import List, Tuple -from docutils.nodes import Node -from sphinx.locale import __ -from sphinx.ext.autosummary import Autosummary, posixpath, addnodes, logger, Matcher, autosummary_toc, get_import_prefixes_from_env -from sphinx.ext.autosummary import mock, StringList, ModuleType, get_documenter, ModuleAnalyzer, PycodeError, mangle_signature -from sphinx.ext.autosummary import import_by_name, extract_summary, autosummary_table, nodes, switch_source_input, rst -from sphinx.ext.autodoc.directive import DocumenterBridge, Options - - -class MsAutosummary(Autosummary): - """ - Inherited from sphinx's autosummary, add titles and a column for the generated table. - """ - - def init(self): - """ - init method - """ - self.find_doc_name = "" - self.third_title = "" - self.default_doc = "" - - def extract_env_summary(self, doc: List[str]) -> str: - """Extract env summary from docstring.""" - env_sum = self.default_doc - for i, piece in enumerate(doc): - if piece.startswith(self.find_doc_name): - env_sum = doc[i+1][4:] - return env_sum - - def run(self): - """ - run method - """ - self.init() - self.bridge = DocumenterBridge(self.env, self.state.document.reporter, - Options(), self.lineno, self.state) - - names = [x.strip().split()[0] for x in self.content - if x.strip() and re.search(r'^[~a-zA-Z_]', x.strip()[0])] - items = self.get_items(names) - teble_nodes = self.get_table(items) - - if 'toctree' in self.options: - dirname = posixpath.dirname(self.env.docname) - - tree_prefix = self.options['toctree'].strip() - docnames = [] - excluded = Matcher(self.config.exclude_patterns) - for item in items: - docname = posixpath.join(tree_prefix, item[3]) - docname = posixpath.normpath(posixpath.join(dirname, docname)) - if docname not in self.env.found_docs: - location = self.state_machine.get_source_and_line(self.lineno) - if excluded(self.env.doc2path(docname, None)): - msg = __('autosummary references excluded document %r. 
Ignored.') - else: - msg = __('autosummary: stub file not found %r. ' - 'Check your autosummary_generate setting.') - logger.warning(msg, item[3], location=location) - continue - docnames.append(docname) - - if docnames: - tocnode = addnodes.toctree() - tocnode['includefiles'] = docnames - tocnode['entries'] = [(None, docn) for docn in docnames] - tocnode['maxdepth'] = -1 - tocnode['glob'] = None - teble_nodes.append(autosummary_toc('', '', tocnode)) - return teble_nodes - - def get_items(self, names: List[str]) -> List[Tuple[str, str, str, str, str]]: - """Try to import the given names, and return a list of - ``[(name, signature, summary_string, real_name, env_summary), ...]``. - """ - prefixes = get_import_prefixes_from_env(self.env) - items = [] # type: List[Tuple[str, str, str, str, str]] - max_item_chars = 50 - - for name in names: - display_name = name - if name.startswith('~'): - name = name[1:] - display_name = name.split('.')[-1] - try: - with mock(self.config.autosummary_mock_imports): - real_name, obj, parent, modname = import_by_name(name, prefixes=prefixes) - except ImportError: - logger.warning(__('failed to import %s'), name) - items.append((name, '', '', name, '')) - continue - - self.bridge.result = StringList() # initialize for each documenter - full_name = real_name - if not isinstance(obj, ModuleType): - # give explicitly separated module name, so that members - # of inner classes can be documented - full_name = modname + '::' + full_name[len(modname) + 1:] - # NB. using full_name here is important, since Documenters - # handle module prefixes slightly differently - doccls = get_documenter(self.env.app, obj, parent) - documenter = doccls(self.bridge, full_name) - - if not documenter.parse_name(): - logger.warning(__('failed to parse name %s'), real_name) - items.append((display_name, '', '', real_name, '')) - continue - if not documenter.import_object(): - logger.warning(__('failed to import object %s'), real_name) - items.append((display_name, '', '', real_name, '')) - continue - if documenter.options.members and not documenter.check_module(): - continue - - # try to also get a source code analyzer for attribute docs - try: - documenter.analyzer = ModuleAnalyzer.for_module( - documenter.get_real_modname()) - # parse right now, to get PycodeErrors on parsing (results will - # be cached anyway) - documenter.analyzer.find_attr_docs() - except PycodeError as err: - logger.debug('[autodoc] module analyzer failed: %s', err) - # no source file -- e.g. for builtin and C modules - documenter.analyzer = None - - # -- Grab the signature - - try: - sig = documenter.format_signature(show_annotation=False) - except TypeError: - # the documenter does not support ``show_annotation`` option - sig = documenter.format_signature() - - if not sig: - sig = '' - else: - max_chars = max(10, max_item_chars - len(display_name)) - sig = mangle_signature(sig, max_chars=max_chars) - - # -- Grab the summary - - documenter.add_content(None) - summary = extract_summary(self.bridge.result.data[:], self.state.document) - env_sum = self.extract_env_summary(self.bridge.result.data[:]) - items.append((display_name, sig, summary, real_name, env_sum)) - - return items - - def get_table(self, items: List[Tuple[str, str, str, str, str]]) -> List[Node]: - """Generate a proper list of table nodes for autosummary:: directive. - - *items* is a list produced by :meth:`get_items`. 
- """ - table_spec = addnodes.tabular_col_spec() - table_spec['spec'] = r'\X{1}{2}\X{1}{2}' - - table = autosummary_table('') - real_table = nodes.table('', classes=['longtable']) - table.append(real_table) - group = nodes.tgroup('', cols=3) - real_table.append(group) - group.append(nodes.colspec('', colwidth=10)) - group.append(nodes.colspec('', colwidth=70)) - group.append(nodes.colspec('', colwidth=30)) - body = nodes.tbody('') - group.append(body) - - def append_row(*column_texts: str) -> None: - row = nodes.row('', color="red") - source, line = self.state_machine.get_source_and_line() - for text in column_texts: - node = nodes.paragraph('') - vl = StringList() - vl.append(text, '%s:%d:' % (source, line)) - with switch_source_input(self.state, vl): - self.state.nested_parse(vl, 0, node) - try: - if isinstance(node[0], nodes.paragraph): - node = node[0] - except IndexError: - pass - row.append(nodes.entry('', node)) - body.append(row) - - # add table's title - append_row("**API Name**", "**Description**", self.third_title) - for name, sig, summary, real_name, env_sum in items: - qualifier = 'obj' - if 'nosignatures' not in self.options: - col1 = ':%s:`%s <%s>`\\ %s' % (qualifier, name, real_name, rst.escape(sig)) - else: - col1 = ':%s:`%s <%s>`' % (qualifier, name, real_name) - col2 = summary - col3 = env_sum - append_row(col1, col2, col3) - - return [table_spec, table] - - -class MsNoteAutoSummary(MsAutosummary): - """ - Inherited from MsAutosummary. Add a third column about `Note` to the table. - """ - - def init(self): - """ - init method - """ - self.find_doc_name = ".. note::" - self.third_title = "**Note**" - self.default_doc = "None" - - def extract_env_summary(self, doc: List[str]) -> str: - """Extract env summary from docstring.""" - env_sum = self.default_doc - for piece in doc: - if piece.startswith(self.find_doc_name): - env_sum = piece[10:] - return env_sum - -class MsPlatformAutoSummary(MsAutosummary): - """ - Inherited from MsAutosummary. Add a third column about `Supported Platforms` to the table. - """ - def init(self): - """ - init method - """ - self.find_doc_name = "Supported Platforms:" - self.third_title = "**{}**".format(self.find_doc_name[:-1]) - self.default_doc = "``Ascend`` ``GPU`` ``CPU``" - -class MsCnAutoSummary(Autosummary): - """Overwrite MsPlatformAutosummary for chinese python api.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.table_head = () - self.find_doc_name = "" - self.third_title = "" - self.default_doc = "" - self.third_name_en = "" - - def get_third_column_en(self, doc): - """Get the third column for en.""" - third_column = self.default_doc - for i, piece in enumerate(doc): - if piece.startswith(self.third_name_en): - try: - if "eprecated" in doc[i+1][4:]: - third_column = "弃用" - else: - third_column = doc[i+1][4:] - except IndexError: - third_column = '' - return third_column - - def get_summary_re(self, display_name: str): - return re.compile(rf'\.\. 
\w+:\w+::\s+{display_name}.*?\n\n\s+(.*?)[。\n]') - - def run(self) -> List[Node]: - self.bridge = DocumenterBridge(self.env, self.state.document.reporter, - Options(), self.lineno, self.state) - - names = [x.strip().split()[0] for x in self.content - if x.strip() and re.search(r'^[~a-zA-Z_]', x.strip()[0])] - items = self.get_items(names) - #pylint: disable=redefined-outer-name - nodes = self.get_table(items) - - dirname = posixpath.dirname(self.env.docname) - - tree_prefix = self.options['toctree'].strip() - docnames = [] - names = [i[0] for i in items] - for name in names: - docname = posixpath.join(tree_prefix, name) - docname = posixpath.normpath(posixpath.join(dirname, docname)) - if docname not in self.env.found_docs: - continue - - docnames.append(docname) - - if docnames: - tocnode = addnodes.toctree() - tocnode['includefiles'] = docnames - tocnode['entries'] = [(None, docn) for docn in docnames] - tocnode['maxdepth'] = -1 - tocnode['glob'] = None - - nodes.append(autosummary_toc('', '', tocnode)) - - return nodes - - def get_items(self, names: List[str]) -> List[Tuple[str, str, str, str]]: - """Try to import the given names, and return a list of - ``[(name, signature, summary_string, real_name), ...]``. - """ - prefixes = get_import_prefixes_from_env(self.env) - doc_path = os.path.dirname(self.state.document.current_source) - items = [] # type: List[Tuple[str, str, str, str]] - max_item_chars = 50 - origin_rst_files = self.env.config.rst_files - all_rst_files = self.env.found_docs - generated_files = all_rst_files.difference(origin_rst_files) - - for name in names: - display_name = name - if name.startswith('~'): - name = name[1:] - display_name = name.split('.')[-1] - - dir_name = self.options['toctree'] - spec_path = os.path.join('api_python', dir_name, display_name) - file_path = os.path.join(doc_path, dir_name, display_name+'.rst') - if os.path.exists(file_path) and spec_path not in generated_files: - summary_re_tag = re.compile(rf'\.\. \w+:\w+::\s+{display_name}.*?\n\s+:.*?:\n\n\s+(.*?)[。\n]') - summary_re_line = re.compile(rf'\.\. 
\w+:\w+::\s+{display_name}(?:.|\n|)+?\n\n\s+(.*?)[。\n]') - summary_re = self.get_summary_re(display_name) - content = '' - with open(file_path, 'r', encoding='utf-8') as f: - content = f.read() - if content: - summary_str = summary_re.findall(content) - summary_str_tag = summary_re_tag.findall(content) - summary_str_line = summary_re_line.findall(content) - if summary_str: - if re.findall("[::,,。.;;]", summary_str[0][-1]): - logger.warning(f"{display_name}接口的概述格式需调整") - summary_str = summary_str[0] + '。' - elif summary_str_tag: - if re.findall("[::,,。.;;]", summary_str_tag[0][-1]): - logger.warning(f"{display_name}接口的概述格式需调整") - summary_str = summary_str_tag[0] + '。' - elif summary_str_line: - if re.findall("[::,,。.;;]", summary_str_line[0][-1]): - logger.warning(f"{display_name}接口的概述格式需调整") - summary_str = summary_str_line[0] + '。' - else: - summary_str = '' - if not self.table_head: - items.append((display_name, summary_str)) - else: - third_str = self.get_third_column(display_name, content) - if third_str: - third_str = third_str[0] - else: - third_str = '' - - items.append((display_name, summary_str, third_str)) - else: - try: - with mock(self.config.autosummary_mock_imports): - real_name, obj, parent, modname = import_by_name(name, prefixes=prefixes) - except ImportError: - logger.warning(__('failed to import %s'), name) - items.append((name, '', '')) - continue - - self.bridge.result = StringList() # initialize for each documenter - full_name = real_name - if not isinstance(obj, ModuleType): - # give explicitly separated module name, so that members - # of inner classes can be documented - full_name = modname + '::' + full_name[len(modname) + 1:] - # NB. using full_name here is important, since Documenters - # handle module prefixes slightly differently - doccls = get_documenter(self.env.app, obj, parent) - documenter = doccls(self.bridge, full_name) - - if not documenter.parse_name(): - logger.warning(__('failed to parse name %s'), real_name) - items.append((display_name, '', '')) - continue - if not documenter.import_object(): - logger.warning(__('failed to import object %s'), real_name) - items.append((display_name, '', '')) - continue - if documenter.options.members and not documenter.check_module(): - continue - - # try to also get a source code analyzer for attribute docs - try: - documenter.analyzer = ModuleAnalyzer.for_module( - documenter.get_real_modname()) - # parse right now, to get PycodeErrors on parsing (results will - # be cached anyway) - documenter.analyzer.find_attr_docs() - except PycodeError as err: - logger.debug('[autodoc] module analyzer failed: %s', err) - # no source file -- e.g. 
for builtin and C modules - documenter.analyzer = None - - # -- Grab the signature - - try: - sig = documenter.format_signature(show_annotation=False) - except TypeError: - # the documenter does not support ``show_annotation`` option - sig = documenter.format_signature() - - if not sig: - sig = '' - else: - max_chars = max(10, max_item_chars - len(display_name)) - sig = mangle_signature(sig, max_chars=max_chars) - - # -- Grab the summary and third_colum - - documenter.add_content(None) - summary = extract_summary(self.bridge.result.data[:], self.state.document) - if self.table_head: - third_colum = self.get_third_column_en(self.bridge.result.data[:]) - items.append((display_name, summary, third_colum)) - else: - items.append((display_name, summary)) - - - return items - - def get_table(self, items: List[Tuple[str, str, str]]) -> List[Node]: - """Generate a proper list of table nodes for autosummary:: directive. - - *items* is a list produced by :meth:`get_items`. - """ - table_spec = addnodes.tabular_col_spec() - table = autosummary_table('') - real_table = nodes.table('', classes=['longtable']) - table.append(real_table) - - if not self.table_head: - table_spec['spec'] = r'\X{1}{2}\X{1}{2}' - group = nodes.tgroup('', cols=2) - real_table.append(group) - group.append(nodes.colspec('', colwidth=10)) - group.append(nodes.colspec('', colwidth=90)) - else: - table_spec['spec'] = r'\X{1}{2}\X{1}{2}\X{1}{2}' - group = nodes.tgroup('', cols=3) - real_table.append(group) - group.append(nodes.colspec('', colwidth=10)) - group.append(nodes.colspec('', colwidth=60)) - group.append(nodes.colspec('', colwidth=30)) - body = nodes.tbody('') - group.append(body) - - def append_row(*column_texts: str) -> None: - row = nodes.row('') - source, line = self.state_machine.get_source_and_line() - for text in column_texts: - node = nodes.paragraph('') - vl = StringList() - vl.append(text, '%s:%d:' % (source, line)) - with switch_source_input(self.state, vl): - self.state.nested_parse(vl, 0, node) - try: - if isinstance(node[0], nodes.paragraph): - node = node[0] - except IndexError: - pass - row.append(nodes.entry('', node)) - body.append(row) - append_row(*self.table_head) - if not self.table_head: - try: - for name, summary in items: - qualifier = 'obj' - col1 = ':%s:`%s <%s>`' % (qualifier, name, name) - col2 = summary - append_row(col1, col2) - except ValueError: - logger.warning(items) - else: - for name, summary, other in items: - qualifier = 'obj' - col1 = ':%s:`%s <%s>`' % (qualifier, name, name) - col2 = summary - col3 = other - append_row(col1, col2, col3) - return [table_spec, table] - -def get_api(fullname): - """Get the api module.""" - try: - module_name, api_name = ".".join(fullname.split('.')[:-1]), fullname.split('.')[-1] - # pylint: disable=unused-variable - module_import = importlib.import_module(module_name) - except ModuleNotFoundError: - module_name, api_name = ".".join(fullname.split('.')[:-2]), ".".join(fullname.split('.')[-2:]) - module_import = importlib.import_module(module_name) - # pylint: disable=eval-used - api = eval(f"module_import.{api_name}") - return api - -class MsCnPlatformAutoSummary(MsCnAutoSummary): - """definition of cnmsplatformautosummary.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.table_head = ('**接口名**', '**概述**', '**支持平台**') - self.third_name_en = "Supported Platforms:" - - def get_third_column(self, name=None, content=None): - """Get the`Supported Platforms`.""" - if not name: - return [] - try: - api_doc = 
inspect.getdoc(get_api(name)) - platform_str = re.findall(r'Supported Platforms:\n\s+(.*?)\n\n', api_doc) - if ['deprecated'] == platform_str: - return ["弃用"] - if not platform_str: - platform_str_leak = re.findall(r'Supported Platforms:\n\s+(.*)', api_doc) - if platform_str_leak: - return platform_str_leak - return ["``Ascend`` ``GPU`` ``CPU``"] - return platform_str - except: #pylint: disable=bare-except - return [] - -class MsCnNoteAutoSummary(MsCnAutoSummary): - """definition of cnmsnoteautosummary.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.table_head = ('**接口名**', '**概述**', '**说明**') - self.third_name_en = ".. note::" - - def get_third_column(self, name=None, content=''): - note_re = re.compile(r'\.\. note::\n{,2}\s+(.*?)[。\n]') - third_str = note_re.findall(content) - return third_str diff --git a/docs/mindformers/docs/_ext/overwriteautosummary_generate.txt b/docs/mindformers/docs/_ext/overwriteautosummary_generate.txt deleted file mode 100644 index 4b0a1b1dd2b410ecab971b13da9993c90d65ef0d..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/_ext/overwriteautosummary_generate.txt +++ /dev/null @@ -1,707 +0,0 @@ -""" - sphinx.ext.autosummary.generate - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - Usable as a library or script to generate automatic RST source files for - items referred to in autosummary:: directives. - - Each generated RST file contains a single auto*:: directive which - extracts the docstring of the referred item. - - Example Makefile rule:: - - generate: - sphinx-autogen -o source/generated source/*.rst - - :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. - :license: BSD, see LICENSE for details. -""" - -import argparse -import importlib -import inspect -import locale -import os -import pkgutil -import pydoc -import re -import sys -import warnings -from gettext import NullTranslations -from os import path -from typing import Any, Dict, List, NamedTuple, Sequence, Set, Tuple, Type, Union - -from jinja2 import TemplateNotFound -from jinja2.sandbox import SandboxedEnvironment - -import sphinx.locale -from sphinx import __display_version__, package_dir -from sphinx.application import Sphinx -from sphinx.builders import Builder -from sphinx.config import Config -from sphinx.deprecation import RemovedInSphinx50Warning -from sphinx.ext.autodoc import Documenter -from sphinx.ext.autodoc.importer import import_module -from sphinx.ext.autosummary import (ImportExceptionGroup, get_documenter, import_by_name, - import_ivar_by_name) -from sphinx.locale import __ -from sphinx.pycode import ModuleAnalyzer, PycodeError -from sphinx.registry import SphinxComponentRegistry -from sphinx.util import logging, rst, split_full_qualified_name, get_full_modname -from sphinx.util.inspect import getall, safe_getattr -from sphinx.util.osutil import ensuredir -from sphinx.util.template import SphinxTemplateLoader - -logger = logging.getLogger(__name__) - - -class DummyApplication: - """Dummy Application class for sphinx-autogen command.""" - - def __init__(self, translator: NullTranslations) -> None: - self.config = Config() - self.registry = SphinxComponentRegistry() - self.messagelog: List[str] = [] - self.srcdir = "/" - self.translator = translator - self.verbosity = 0 - self._warncount = 0 - self.warningiserror = False - - self.config.add('autosummary_context', {}, True, None) - self.config.add('autosummary_filename_map', {}, True, None) - self.config.add('autosummary_ignore_module_all', True, 'env', bool) - 
self.config.add('docs_branch', '', True, None) - self.config.add('branch', '', True, None) - self.config.add('cst_module_name', '', True, None) - self.config.add('copy_repo', '', True, None) - self.config.add('giturl', '', True, None) - self.config.add('repo_whl', '', True, None) - self.config.init_values() - - def emit_firstresult(self, *args: Any) -> None: - pass - - -class AutosummaryEntry(NamedTuple): - name: str - path: str - template: str - recursive: bool - - -def setup_documenters(app: Any) -> None: - from sphinx.ext.autodoc import (AttributeDocumenter, ClassDocumenter, DataDocumenter, - DecoratorDocumenter, ExceptionDocumenter, - FunctionDocumenter, MethodDocumenter, ModuleDocumenter, - NewTypeAttributeDocumenter, NewTypeDataDocumenter, - PropertyDocumenter) - documenters: List[Type[Documenter]] = [ - ModuleDocumenter, ClassDocumenter, ExceptionDocumenter, DataDocumenter, - FunctionDocumenter, MethodDocumenter, NewTypeAttributeDocumenter, - NewTypeDataDocumenter, AttributeDocumenter, DecoratorDocumenter, PropertyDocumenter, - ] - for documenter in documenters: - app.registry.add_documenter(documenter.objtype, documenter) - - -def _simple_info(msg: str) -> None: - warnings.warn('_simple_info() is deprecated.', - RemovedInSphinx50Warning, stacklevel=2) - print(msg) - - -def _simple_warn(msg: str) -> None: - warnings.warn('_simple_warn() is deprecated.', - RemovedInSphinx50Warning, stacklevel=2) - print('WARNING: ' + msg, file=sys.stderr) - - -def _underline(title: str, line: str = '=') -> str: - if '\n' in title: - raise ValueError('Can only underline single lines') - return title + '\n' + line * len(title) - - -class AutosummaryRenderer: - """A helper class for rendering.""" - - def __init__(self, app: Union[Builder, Sphinx], template_dir: str = None) -> None: - if isinstance(app, Builder): - warnings.warn('The first argument for AutosummaryRenderer has been ' - 'changed to Sphinx object', - RemovedInSphinx50Warning, stacklevel=2) - if template_dir: - warnings.warn('template_dir argument for AutosummaryRenderer is deprecated.', - RemovedInSphinx50Warning, stacklevel=2) - - system_templates_path = [os.path.join(package_dir, 'ext', 'autosummary', 'templates')] - loader = SphinxTemplateLoader(app.srcdir, app.config.templates_path, - system_templates_path) - - self.env = SandboxedEnvironment(loader=loader) - self.env.filters['escape'] = rst.escape - self.env.filters['e'] = rst.escape - self.env.filters['underline'] = _underline - - if isinstance(app, (Sphinx, DummyApplication)): - if app.translator: - self.env.add_extension("jinja2.ext.i18n") - self.env.install_gettext_translations(app.translator) - elif isinstance(app, Builder): - if app.app.translator: - self.env.add_extension("jinja2.ext.i18n") - self.env.install_gettext_translations(app.app.translator) - - def exists(self, template_name: str) -> bool: - """Check if template file exists.""" - warnings.warn('AutosummaryRenderer.exists() is deprecated.', - RemovedInSphinx50Warning, stacklevel=2) - try: - self.env.get_template(template_name) - return True - except TemplateNotFound: - return False - - def render(self, template_name: str, context: Dict) -> str: - """Render a template file.""" - try: - template = self.env.get_template(template_name) - except TemplateNotFound: - try: - # objtype is given as template_name - template = self.env.get_template('autosummary/%s.rst' % template_name) - except TemplateNotFound: - # fallback to base.rst - template = self.env.get_template('autosummary/base.rst') - - return template.render(context) - 
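# A minimal, hedged sketch (not part of the deleted file) of why DummyApplication above
# registers docs_branch / branch / copy_repo / giturl / repo_whl / cst_module_name:
# generate_autosummary_docs() further down concatenates them into a "View Source On Gitee"
# badge that is spliced into every generated RST stub right after its title underline.
# Every value below is a hypothetical placeholder; the real ones come from conf.py.

docs_branch = "r2.3"                                  # hypothetical docs-site branch
giturl = "https://gitee.com/mindspore/"               # hypothetical Gitee base URL
copy_repo = "mindformers"                             # hypothetical repository name
branch = "dev"                                        # hypothetical code branch
repo_whl = "mindformers/"                             # hypothetical path prefix inside the repo
cst_module_name = "mindformers/"                      # hypothetical package prefix to strip
py_source_rel = "mindformers/models/llama/llama.py"   # hypothetical module source path

# Same string construction as the re_view assignment in generate_autosummary_docs below.
re_view = (
    "\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/"
    f"website-images/{docs_branch}/resource/_static/logo_source_en.svg"
    f"\n    :target: {giturl}{copy_repo}/blob/{branch}/{repo_whl}"
    + py_source_rel.split(cst_module_name)[-1]
    + "\n    :alt: View Source On Gitee\n\n"
)
print(re_view)  # badge RST that gets inserted after the first '=====' underline of each stub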
- -# -- Generating output --------------------------------------------------------- - - -class ModuleScanner: - def __init__(self, app: Any, obj: Any) -> None: - self.app = app - self.object = obj - - def get_object_type(self, name: str, value: Any) -> str: - return get_documenter(self.app, value, self.object).objtype - - def is_skipped(self, name: str, value: Any, objtype: str) -> bool: - try: - return self.app.emit_firstresult('autodoc-skip-member', objtype, - name, value, False, {}) - except Exception as exc: - logger.warning(__('autosummary: failed to determine %r to be documented, ' - 'the following exception was raised:\n%s'), - name, exc, type='autosummary') - return False - - def scan(self, imported_members: bool) -> List[str]: - members = [] - for name in members_of(self.object, self.app.config): - try: - value = safe_getattr(self.object, name) - except AttributeError: - value = None - - objtype = self.get_object_type(name, value) - if self.is_skipped(name, value, objtype): - continue - - try: - if inspect.ismodule(value): - imported = True - elif safe_getattr(value, '__module__') != self.object.__name__: - imported = True - else: - imported = False - except AttributeError: - imported = False - - respect_module_all = not self.app.config.autosummary_ignore_module_all - if imported_members: - # list all members up - members.append(name) - elif imported is False: - # list not-imported members - members.append(name) - elif '__all__' in dir(self.object) and respect_module_all: - # list members that have __all__ set - members.append(name) - - return members - - -def members_of(obj: Any, conf: Config) -> Sequence[str]: - """Get the members of ``obj``, possibly ignoring the ``__all__`` module attribute - - Follows the ``conf.autosummary_ignore_module_all`` setting.""" - - if conf.autosummary_ignore_module_all: - return dir(obj) - else: - return getall(obj) or dir(obj) - - -def generate_autosummary_content(name: str, obj: Any, parent: Any, - template: AutosummaryRenderer, template_name: str, - imported_members: bool, app: Any, - recursive: bool, context: Dict, - modname: str = None, qualname: str = None) -> str: - doc = get_documenter(app, obj, parent) - - def skip_member(obj: Any, name: str, objtype: str) -> bool: - try: - return app.emit_firstresult('autodoc-skip-member', objtype, name, - obj, False, {}) - except Exception as exc: - logger.warning(__('autosummary: failed to determine %r to be documented, ' - 'the following exception was raised:\n%s'), - name, exc, type='autosummary') - return False - - def get_class_members(obj: Any) -> Dict[str, Any]: - members = sphinx.ext.autodoc.get_class_members(obj, [qualname], safe_getattr) - return {name: member.object for name, member in members.items()} - - def get_module_members(obj: Any) -> Dict[str, Any]: - members = {} - for name in members_of(obj, app.config): - try: - members[name] = safe_getattr(obj, name) - except AttributeError: - continue - return members - - def get_all_members(obj: Any) -> Dict[str, Any]: - if doc.objtype == "module": - return get_module_members(obj) - elif doc.objtype == "class": - return get_class_members(obj) - return {} - - def get_members(obj: Any, types: Set[str], include_public: List[str] = [], - imported: bool = True) -> Tuple[List[str], List[str]]: - items: List[str] = [] - public: List[str] = [] - - all_members = get_all_members(obj) - for name, value in all_members.items(): - documenter = get_documenter(app, value, obj) - if documenter.objtype in types: - # skip imported members if expected - if imported 
or getattr(value, '__module__', None) == obj.__name__: - skipped = skip_member(value, name, documenter.objtype) - if skipped is True: - pass - elif skipped is False: - # show the member forcedly - items.append(name) - public.append(name) - else: - items.append(name) - if name in include_public or not name.startswith('_'): - # considers member as public - public.append(name) - return public, items - - def get_module_attrs(members: Any) -> Tuple[List[str], List[str]]: - """Find module attributes with docstrings.""" - attrs, public = [], [] - try: - analyzer = ModuleAnalyzer.for_module(name) - attr_docs = analyzer.find_attr_docs() - for namespace, attr_name in attr_docs: - if namespace == '' and attr_name in members: - attrs.append(attr_name) - if not attr_name.startswith('_'): - public.append(attr_name) - except PycodeError: - pass # give up if ModuleAnalyzer fails to parse code - return public, attrs - - def get_modules(obj: Any) -> Tuple[List[str], List[str]]: - items: List[str] = [] - for _, modname, _ispkg in pkgutil.iter_modules(obj.__path__): - fullname = name + '.' + modname - try: - module = import_module(fullname) - if module and hasattr(module, '__sphinx_mock__'): - continue - except ImportError: - pass - - items.append(fullname) - public = [x for x in items if not x.split('.')[-1].startswith('_')] - return public, items - - ns: Dict[str, Any] = {} - ns.update(context) - - if doc.objtype == 'module': - scanner = ModuleScanner(app, obj) - ns['members'] = scanner.scan(imported_members) - ns['functions'], ns['all_functions'] = \ - get_members(obj, {'function'}, imported=imported_members) - ns['classes'], ns['all_classes'] = \ - get_members(obj, {'class'}, imported=imported_members) - ns['exceptions'], ns['all_exceptions'] = \ - get_members(obj, {'exception'}, imported=imported_members) - ns['attributes'], ns['all_attributes'] = \ - get_module_attrs(ns['members']) - ispackage = hasattr(obj, '__path__') - if ispackage and recursive: - ns['modules'], ns['all_modules'] = get_modules(obj) - elif doc.objtype == 'class': - ns['members'] = dir(obj) - ns['inherited_members'] = \ - set(dir(obj)) - set(obj.__dict__.keys()) - ns['methods'], ns['all_methods'] = \ - get_members(obj, {'method'}, ['__init__']) - ns['attributes'], ns['all_attributes'] = \ - get_members(obj, {'attribute', 'property'}) - - if modname is None or qualname is None: - modname, qualname = split_full_qualified_name(name) - - if doc.objtype in ('method', 'attribute', 'property'): - ns['class'] = qualname.rsplit(".", 1)[0] - - if doc.objtype in ('class',): - shortname = qualname - else: - shortname = qualname.rsplit(".", 1)[-1] - - ns['fullname'] = name - ns['module'] = modname - ns['objname'] = qualname - ns['name'] = shortname - - ns['objtype'] = doc.objtype - ns['underline'] = len(name) * '=' - - if template_name: - return template.render(template_name, ns) - else: - return template.render(doc.objtype, ns) - - -def generate_autosummary_docs(sources: List[str], output_dir: str = None, - suffix: str = '.rst', base_path: str = None, - builder: Builder = None, template_dir: str = None, - imported_members: bool = False, app: Any = None, - overwrite: bool = True, encoding: str = 'utf-8') -> None: - - if builder: - warnings.warn('builder argument for generate_autosummary_docs() is deprecated.', - RemovedInSphinx50Warning, stacklevel=2) - - if template_dir: - warnings.warn('template_dir argument for generate_autosummary_docs() is deprecated.', - RemovedInSphinx50Warning, stacklevel=2) - - showed_sources = list(sorted(sources)) - if 
len(showed_sources) > 20: - showed_sources = showed_sources[:10] + ['...'] + showed_sources[-10:] - logger.info(__('[autosummary] generating autosummary for: %s') % - ', '.join(showed_sources)) - - if output_dir: - logger.info(__('[autosummary] writing to %s') % output_dir) - - if base_path is not None: - sources = [os.path.join(base_path, filename) for filename in sources] - - template = AutosummaryRenderer(app) - - # read - items = find_autosummary_in_files(sources) - - # keep track of new files - new_files = [] - - if app: - filename_map = app.config.autosummary_filename_map - else: - filename_map = {} - - # write - for entry in sorted(set(items), key=str): - if entry.path is None: - # The corresponding autosummary:: directive did not have - # a :toctree: option - continue - - path = output_dir or os.path.abspath(entry.path) - ensuredir(path) - - try: - name, obj, parent, modname = import_by_name(entry.name, grouped_exception=True) - qualname = name.replace(modname + ".", "") - except ImportExceptionGroup as exc: - try: - # try to import as an instance attribute - name, obj, parent, modname = import_ivar_by_name(entry.name) - qualname = name.replace(modname + ".", "") - except ImportError as exc2: - if exc2.__cause__: - exceptions: List[BaseException] = exc.exceptions + [exc2.__cause__] - else: - exceptions = exc.exceptions + [exc2] - - errors = list(set("* %s: %s" % (type(e).__name__, e) for e in exceptions)) - logger.warning(__('[autosummary] failed to import %s.\nPossible hints:\n%s'), - entry.name, '\n'.join(errors)) - continue - - context: Dict[str, Any] = {} - if app: - context.update(app.config.autosummary_context) - - content = generate_autosummary_content(name, obj, parent, template, entry.template, - imported_members, app, entry.recursive, context, - modname, qualname) - try: - py_source_rel = get_full_modname(modname, qualname).replace('.', '/') + '.py' - except: - logger.warning(name) - py_source_rel = '' - - re_view = f"\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{app.config.docs_branch}/" + \ - f"resource/_static/logo_source_en.svg\n :target: " + app.config.giturl + \ - f"{app.config.copy_repo}/blob/{app.config.branch}/" + app.config.repo_whl + \ - py_source_rel.split(app.config.cst_module_name)[-1] + '\n :alt: View Source On Gitee\n\n' - - if re_view not in content and py_source_rel: - content = re.sub('([=]{5,})\n', r'\1\n' + re_view, content, 1) - filename = os.path.join(path, filename_map.get(name, name) + suffix) - if os.path.isfile(filename): - with open(filename, encoding=encoding) as f: - old_content = f.read() - - if content == old_content: - continue - elif overwrite: # content has changed - with open(filename, 'w', encoding=encoding) as f: - f.write(content) - new_files.append(filename) - else: - with open(filename, 'w', encoding=encoding) as f: - f.write(content) - new_files.append(filename) - - # descend recursively to new files - if new_files: - generate_autosummary_docs(new_files, output_dir=output_dir, - suffix=suffix, base_path=base_path, - builder=builder, template_dir=template_dir, - imported_members=imported_members, app=app, - overwrite=overwrite) - - -# -- Finding documented entries in files --------------------------------------- - -def find_autosummary_in_files(filenames: List[str]) -> List[AutosummaryEntry]: - """Find out what items are documented in source/*.rst. - - See `find_autosummary_in_lines`. 
- """ - documented: List[AutosummaryEntry] = [] - for filename in filenames: - with open(filename, encoding='utf-8', errors='ignore') as f: - lines = f.read().splitlines() - documented.extend(find_autosummary_in_lines(lines, filename=filename)) - return documented - - -def find_autosummary_in_docstring(name: str, module: str = None, filename: str = None - ) -> List[AutosummaryEntry]: - """Find out what items are documented in the given object's docstring. - - See `find_autosummary_in_lines`. - """ - if module: - warnings.warn('module argument for find_autosummary_in_docstring() is deprecated.', - RemovedInSphinx50Warning, stacklevel=2) - - try: - real_name, obj, parent, modname = import_by_name(name, grouped_exception=True) - lines = pydoc.getdoc(obj).splitlines() - return find_autosummary_in_lines(lines, module=name, filename=filename) - except AttributeError: - pass - except ImportExceptionGroup as exc: - errors = list(set("* %s: %s" % (type(e).__name__, e) for e in exc.exceptions)) - print('Failed to import %s.\nPossible hints:\n%s' % (name, '\n'.join(errors))) - except SystemExit: - print("Failed to import '%s'; the module executes module level " - "statement and it might call sys.exit()." % name) - return [] - - -def find_autosummary_in_lines(lines: List[str], module: str = None, filename: str = None - ) -> List[AutosummaryEntry]: - """Find out what items appear in autosummary:: directives in the - given lines. - - Returns a list of (name, toctree, template) where *name* is a name - of an object and *toctree* the :toctree: path of the corresponding - autosummary directive (relative to the root of the file name), and - *template* the value of the :template: option. *toctree* and - *template* ``None`` if the directive does not have the - corresponding options set. 
- """ - autosummary_re = re.compile(r'^(\s*)\.\.\s+(ms[a-z]*)?autosummary::\s*') - automodule_re = re.compile( - r'^\s*\.\.\s+automodule::\s*([A-Za-z0-9_.]+)\s*$') - module_re = re.compile( - r'^\s*\.\.\s+(current)?module::\s*([a-zA-Z0-9_.]+)\s*$') - autosummary_item_re = re.compile(r'^\s+(~?[_a-zA-Z][a-zA-Z0-9_.]*)\s*.*?') - recursive_arg_re = re.compile(r'^\s+:recursive:\s*$') - toctree_arg_re = re.compile(r'^\s+:toctree:\s*(.*?)\s*$') - template_arg_re = re.compile(r'^\s+:template:\s*(.*?)\s*$') - - documented: List[AutosummaryEntry] = [] - - recursive = False - toctree: str = None - template = None - current_module = module - in_autosummary = False - base_indent = "" - - for line in lines: - if in_autosummary: - m = recursive_arg_re.match(line) - if m: - recursive = True - continue - - m = toctree_arg_re.match(line) - if m: - toctree = m.group(1) - if filename: - toctree = os.path.join(os.path.dirname(filename), - toctree) - continue - - m = template_arg_re.match(line) - if m: - template = m.group(1).strip() - continue - - if line.strip().startswith(':'): - continue # skip options - - m = autosummary_item_re.match(line) - if m: - name = m.group(1).strip() - if name.startswith('~'): - name = name[1:] - if current_module and \ - not name.startswith(current_module + '.'): - name = "%s.%s" % (current_module, name) - documented.append(AutosummaryEntry(name, toctree, template, recursive)) - continue - - if not line.strip() or line.startswith(base_indent + " "): - continue - - in_autosummary = False - - m = autosummary_re.match(line) - if m: - in_autosummary = True - base_indent = m.group(1) - recursive = False - toctree = None - template = None - continue - - m = automodule_re.search(line) - if m: - current_module = m.group(1).strip() - # recurse into the automodule docstring - documented.extend(find_autosummary_in_docstring( - current_module, filename=filename)) - continue - - m = module_re.match(line) - if m: - current_module = m.group(2) - continue - - return documented - - -def get_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - usage='%(prog)s [OPTIONS] ...', - epilog=__('For more information, visit .'), - description=__(""" -Generate ReStructuredText using autosummary directives. - -sphinx-autogen is a frontend to sphinx.ext.autosummary.generate. It generates -the reStructuredText files from the autosummary directives contained in the -given input files. 
- -The format of the autosummary directive is documented in the -``sphinx.ext.autosummary`` Python module and can be read using:: - - pydoc sphinx.ext.autosummary -""")) - - parser.add_argument('--version', action='version', dest='show_version', - version='%%(prog)s %s' % __display_version__) - - parser.add_argument('source_file', nargs='+', - help=__('source files to generate rST files for')) - - parser.add_argument('-o', '--output-dir', action='store', - dest='output_dir', - help=__('directory to place all output in')) - parser.add_argument('-s', '--suffix', action='store', dest='suffix', - default='rst', - help=__('default suffix for files (default: ' - '%(default)s)')) - parser.add_argument('-t', '--templates', action='store', dest='templates', - default=None, - help=__('custom template directory (default: ' - '%(default)s)')) - parser.add_argument('-i', '--imported-members', action='store_true', - dest='imported_members', default=False, - help=__('document imported members (default: ' - '%(default)s)')) - parser.add_argument('-a', '--respect-module-all', action='store_true', - dest='respect_module_all', default=False, - help=__('document exactly the members in module __all__ attribute. ' - '(default: %(default)s)')) - - return parser - - -def main(argv: List[str] = sys.argv[1:]) -> None: - sphinx.locale.setlocale(locale.LC_ALL, '') - sphinx.locale.init_console(os.path.join(package_dir, 'locale'), 'sphinx') - translator, _ = sphinx.locale.init([], None) - - app = DummyApplication(translator) - logging.setup(app, sys.stdout, sys.stderr) # type: ignore - setup_documenters(app) - args = get_parser().parse_args(argv) - - if args.templates: - app.config.templates_path.append(path.abspath(args.templates)) - app.config.autosummary_ignore_module_all = not args.respect_module_all # type: ignore - - generate_autosummary_docs(args.source_file, args.output_dir, - '.' + args.suffix, - imported_members=args.imported_members, - app=app) - - -if __name__ == '__main__': - main() diff --git a/docs/mindformers/docs/_ext/overwriteobjectiondirective.txt b/docs/mindformers/docs/_ext/overwriteobjectiondirective.txt deleted file mode 100644 index 4555a4e95729689c5136a130537ea29f29d46403..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/_ext/overwriteobjectiondirective.txt +++ /dev/null @@ -1,436 +0,0 @@ -""" - sphinx.directives - ~~~~~~~~~~~~~~~~~ - - Handlers for additional ReST directives. - - :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. - :license: BSD, see LICENSE for details. -""" - -import re -import inspect -import importlib -from typing import TYPE_CHECKING, Any, Dict, Generic, List, Tuple, TypeVar, cast - -from docutils import nodes -from docutils.nodes import Node -from docutils.parsers.rst import directives, roles - -from sphinx import addnodes -from sphinx.addnodes import desc_signature -from sphinx.deprecation import RemovedInSphinx50Warning, deprecated_alias -from sphinx.util import docutils, logging -from sphinx.util.docfields import DocFieldTransformer, Field, TypedField -from sphinx.util.docutils import SphinxDirective -from sphinx.util.typing import OptionSpec - -if TYPE_CHECKING: - from sphinx.application import Sphinx - - -# RE to strip backslash escapes -nl_escape_re = re.compile(r'\\\n') -strip_backslash_re = re.compile(r'\\(.)') - -T = TypeVar('T') -logger = logging.getLogger(__name__) - -def optional_int(argument: str) -> int: - """ - Check for an integer argument or None value; raise ``ValueError`` if not. 
- """ - if argument is None: - return None - else: - value = int(argument) - if value < 0: - raise ValueError('negative value; must be positive or zero') - return value - -def get_api(fullname): - try: - module_name, api_name= ".".join(fullname.split('.')[:-1]), fullname.split('.')[-1] - module_import = importlib.import_module(module_name) - except ModuleNotFoundError: - module_name, api_name = ".".join(fullname.split('.')[:-2]), ".".join(fullname.split('.')[-2:]) - module_import = importlib.import_module(module_name) - api = eval(f"module_import.{api_name}") - return api - -def get_example(name: str): - try: - api_doc = inspect.getdoc(get_api(name)) - example_str = re.findall(r'Examples:\n([\w\W]*?)(\n\n|$)', api_doc) - if not example_str: - return [] - if '.. note::' in example_str[0][0]: - api_doc = re.sub(r'Examples:\n \.\. note::(?:.|\n)*? >>>', r'Examples:\n >>>', api_doc) - example_str = re.findall(r'(? Dict[str, Tuple[Field, bool]]: - if self._doc_field_type_map == {}: - self._doc_field_type_map = {} - for field in self.doc_field_types: - for name in field.names: - self._doc_field_type_map[name] = (field, False) - - if field.is_typed: - typed_field = cast(TypedField, field) - for name in typed_field.typenames: - self._doc_field_type_map[name] = (field, True) - - return self._doc_field_type_map - - def get_signatures(self) -> List[str]: - """ - Retrieve the signatures to document from the directive arguments. By - default, signatures are given as arguments, one per line. - - Backslash-escaping of newlines is supported. - """ - lines = nl_escape_re.sub('', self.arguments[0]).split('\n') - if self.config.strip_signature_backslash: - # remove backslashes to support (dummy) escapes; helps Vim highlighting - return [strip_backslash_re.sub(r'\1', line.strip()) for line in lines] - else: - return [line.strip() for line in lines] - - def handle_signature(self, sig: str, signode: desc_signature) -> Any: - """ - Parse the signature *sig* into individual nodes and append them to - *signode*. If ValueError is raised, parsing is aborted and the whole - *sig* is put into a single desc_name node. - - The return value should be a value that identifies the object. It is - passed to :meth:`add_target_and_index()` unchanged, and otherwise only - used to skip duplicates. - """ - raise ValueError - - def add_target_and_index(self, name: Any, sig: str, signode: desc_signature) -> None: - """ - Add cross-reference IDs and entries to self.indexnode, if applicable. - - *name* is whatever :meth:`handle_signature()` returned. - """ - return # do nothing by default - - def before_content(self) -> None: - """ - Called before parsing content. Used to set information about the current - directive context on the build environment. - """ - pass - - def transform_content(self, contentnode: addnodes.desc_content) -> None: - """ - Called after creating the content through nested parsing, - but before the ``object-description-transform`` event is emitted, - and before the info-fields are transformed. - Can be used to manipulate the content. - """ - pass - - def after_content(self) -> None: - """ - Called after parsing content. Used to reset information about the - current directive context on the build environment. - """ - pass - - def check_class_end(self, content): - for i in content: - if not i.startswith('.. 
include::') and i != "\n" and i != "": - return False - return True - - def extend_items(self, rst_file, start_num, num): - ls = [] - for i in range(1, num+1): - ls.append((rst_file, start_num+i)) - return ls - - def run(self) -> List[Node]: - """ - Main directive entry function, called by docutils upon encountering the - directive. - - This directive is meant to be quite easily subclassable, so it delegates - to several additional methods. What it does: - - * find out if called as a domain-specific directive, set self.domain - * create a `desc` node to fit all description inside - * parse standard options, currently `noindex` - * create an index node if needed as self.indexnode - * parse all given signatures (as returned by self.get_signatures()) - using self.handle_signature(), which should either return a name - or raise ValueError - * add index entries using self.add_target_and_index() - * parse the content and handle doc fields in it - """ - if ':' in self.name: - self.domain, self.objtype = self.name.split(':', 1) - else: - self.domain, self.objtype = '', self.name - self.indexnode = addnodes.index(entries=[]) - - node = addnodes.desc() - node.document = self.state.document - node['domain'] = self.domain - # 'desctype' is a backwards compatible attribute - node['objtype'] = node['desctype'] = self.objtype - node['noindex'] = noindex = ('noindex' in self.options) - if self.domain: - node['classes'].append(self.domain) - node['classes'].append(node['objtype']) - - self.names: List[T] = [] - signatures = self.get_signatures() - for sig in signatures: - # add a signature node for each signature in the current unit - # and add a reference target for it - signode = addnodes.desc_signature(sig, '') - self.set_source_info(signode) - node.append(signode) - try: - # name can also be a tuple, e.g. (classname, objname); - # this is strictly domain-specific (i.e. 
no assumptions may - # be made in this base class) - name = self.handle_signature(sig, signode) - except ValueError: - # signature parsing failed - signode.clear() - signode += addnodes.desc_name(sig, sig) - continue # we don't want an index entry here - if name not in self.names: - self.names.append(name) - if not noindex: - # only add target and index entry if this is the first - # description of the object with this name in this desc block - self.add_target_and_index(name, sig, signode) - - contentnode = addnodes.desc_content() - node.append(contentnode) - if self.names: - # needed for association of version{added,changed} directives - self.env.temp_data['object'] = self.names[0] - self.before_content() - try: - example = get_example(self.names[0][0]) - platforms = get_platforms(self.names[0][0]) - except Exception as e: - example = '' - platforms = '' - logger.warning(f'Error API names in {self.arguments[0]}.') - logger.warning(f'{e}') - extra = platforms + example - if "**样例:**" not in example and example: - try: - if self.objtype == "method": - index_platforms = 0 - for num, i in enumerate(self.content.data): - if i.startswith('样例:'): - index_platforms = num - break - if index_platforms and platforms: - self.content.data[index_platforms] = '**样例:**' - self.content.data.insert(index_platforms+1, '') - count = len(self.content.data) - for i in platforms: - self.content.data.insert(index_platforms-count, i) - else: - self.content.data[index_platforms] = '**样例:**' - self.content.data.insert(index_platforms+1, '') - self.content.data.extend(example) - else: - index_num = 0 - index_platforms = 0 - for num, i in enumerate(self.content.data): - if i.startswith('.. py:method::') or self.check_class_end(self.content.data[num:]): - index_num = num - break - if index_num: - for num, j in enumerate(self.content.data[:index_num]): - if j.startswith('样例:'): - index_platforms = num - break - if index_platforms and platforms: - self.content.data[index_platforms] = '**样例:**' - self.content.data.insert(index_platforms+1, '') - count = len(self.content.data) - for k in platforms: - self.content.data.insert(index_platforms-count, k) - else: - self.content.data[index_platforms] = '**样例:**' - self.content.data.insert(index_platforms+1, '') - count = len(self.content.data) - count_plat = len(platforms) - for i in example: - self.content.data.insert(index_num-count+count_plat, i) - else: - index_platforms = 0 - for num, i in enumerate(self.content.data): - if i.startswith('样例:'): - index_platforms = num - break - if index_platforms and platforms: - self.content.data[index_platforms] = '**样例:**' - self.content.data.insert(index_platforms+1, '') - count = len(self.content.data) - for i in platforms: - self.content.data.insert(index_platforms-count, i) - else: - self.content.data[index_platforms] = '**样例:**' - self.content.data.insert(index_platforms+1, '') - self.content.data.extend(example) - except Exception as e: - logger.warning(e) - elif extra: - if self.objtype == "method": - self.content.data.extend(extra) - else: - index_num = 0 - for num, i in enumerate(self.content.data): - if i.startswith('.. 
py:method::') or self.check_class_end(self.content.data[num:]): - index_num = num - break - if index_num: - count = len(self.content.data) - for i in extra: - self.content.data.insert(index_num-count, i) - else: - self.content.data.extend(extra) - try: - self.content.items.extend(self.extend_items(self.content.items[0][0], self.content.items[-1][1], len(extra))) - except Exception as e: - logger.warning(f'{e}') - self.state.nested_parse(self.content, self.content_offset, contentnode) - self.transform_content(contentnode) - self.env.app.emit('object-description-transform', - self.domain, self.objtype, contentnode) - DocFieldTransformer(self).transform_all(contentnode) - self.env.temp_data['object'] = None - self.after_content() - return [self.indexnode, node] - - -class DefaultRole(SphinxDirective): - """ - Set the default interpreted text role. Overridden from docutils. - """ - - optional_arguments = 1 - final_argument_whitespace = False - - def run(self) -> List[Node]: - if not self.arguments: - docutils.unregister_role('') - return [] - role_name = self.arguments[0] - role, messages = roles.role(role_name, self.state_machine.language, - self.lineno, self.state.reporter) - if role: - docutils.register_role('', role) - self.env.temp_data['default_role'] = role_name - else: - literal_block = nodes.literal_block(self.block_text, self.block_text) - reporter = self.state.reporter - error = reporter.error('Unknown interpreted text role "%s".' % role_name, - literal_block, line=self.lineno) - messages += [error] - - return cast(List[nodes.Node], messages) - - -class DefaultDomain(SphinxDirective): - """ - Directive to (re-)set the default domain for this source file. - """ - - has_content = False - required_arguments = 1 - optional_arguments = 0 - final_argument_whitespace = False - option_spec = {} # type: Dict - - def run(self) -> List[Node]: - domain_name = self.arguments[0].lower() - # if domain_name not in env.domains: - # # try searching by label - # for domain in env.domains.values(): - # if domain.label.lower() == domain_name: - # domain_name = domain.name - # break - self.env.temp_data['default_domain'] = self.env.domains.get(domain_name) - return [] - -def setup(app: "Sphinx") -> Dict[str, Any]: - app.add_config_value("strip_signature_backslash", False, 'env') - directives.register_directive('default-role', DefaultRole) - directives.register_directive('default-domain', DefaultDomain) - directives.register_directive('describe', ObjectDescription) - # new, more consistent, name - directives.register_directive('object', ObjectDescription) - - app.add_event('object-description-transform') - - return { - 'version': 'builtin', - 'parallel_read_safe': True, - 'parallel_write_safe': True, - } - diff --git a/docs/mindformers/docs/_ext/overwriteviewcode.txt b/docs/mindformers/docs/_ext/overwriteviewcode.txt deleted file mode 100644 index 172780ec56b3ed90e7b0add617257a618cf38ee0..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/_ext/overwriteviewcode.txt +++ /dev/null @@ -1,378 +0,0 @@ -""" - sphinx.ext.viewcode - ~~~~~~~~~~~~~~~~~~~ - - Add links to module code in Python object descriptions. - - :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. - :license: BSD, see LICENSE for details. 
-""" - -import posixpath -import traceback -import warnings -from os import path -from typing import Any, Dict, Generator, Iterable, Optional, Set, Tuple, cast - -from docutils import nodes -from docutils.nodes import Element, Node - -import sphinx -from sphinx import addnodes -from sphinx.application import Sphinx -from sphinx.builders import Builder -from sphinx.builders.html import StandaloneHTMLBuilder -from sphinx.deprecation import RemovedInSphinx50Warning -from sphinx.environment import BuildEnvironment -from sphinx.locale import _, __ -from sphinx.pycode import ModuleAnalyzer -from sphinx.transforms.post_transforms import SphinxPostTransform -from sphinx.util import get_full_modname, logging, status_iterator -from sphinx.util.nodes import make_refnode - - -logger = logging.getLogger(__name__) - - -OUTPUT_DIRNAME = '_modules' - - -class viewcode_anchor(Element): - """Node for viewcode anchors. - - This node will be processed in the resolving phase. - For viewcode supported builders, they will be all converted to the anchors. - For not supported builders, they will be removed. - """ - - -def _get_full_modname(app: Sphinx, modname: str, attribute: str) -> Optional[str]: - try: - return get_full_modname(modname, attribute) - except AttributeError: - # sphinx.ext.viewcode can't follow class instance attribute - # then AttributeError logging output only verbose mode. - logger.verbose('Didn\'t find %s in %s', attribute, modname) - return None - except Exception as e: - # sphinx.ext.viewcode follow python domain directives. - # because of that, if there are no real modules exists that specified - # by py:function or other directives, viewcode emits a lot of warnings. - # It should be displayed only verbose mode. - logger.verbose(traceback.format_exc().rstrip()) - logger.verbose('viewcode can\'t import %s, failed with error "%s"', modname, e) - return None - - -def is_supported_builder(builder: Builder) -> bool: - if builder.format != 'html': - return False - elif builder.name == 'singlehtml': - return False - elif builder.name.startswith('epub') and not builder.config.viewcode_enable_epub: - return False - else: - return True - - -def doctree_read(app: Sphinx, doctree: Node) -> None: - env = app.builder.env - if not hasattr(env, '_viewcode_modules'): - env._viewcode_modules = {} # type: ignore - - def has_tag(modname: str, fullname: str, docname: str, refname: str) -> bool: - entry = env._viewcode_modules.get(modname, None) # type: ignore - if entry is False: - return False - - code_tags = app.emit_firstresult('viewcode-find-source', modname) - if code_tags is None: - try: - analyzer = ModuleAnalyzer.for_module(modname) - analyzer.find_tags() - except Exception: - env._viewcode_modules[modname] = False # type: ignore - return False - - code = analyzer.code - tags = analyzer.tags - else: - code, tags = code_tags - - if entry is None or entry[0] != code: - entry = code, tags, {}, refname - env._viewcode_modules[modname] = entry # type: ignore - _, tags, used, _ = entry - if fullname in tags: - used[fullname] = docname - return True - - return False - - for objnode in list(doctree.findall(addnodes.desc)): - if objnode.get('domain') != 'py': - continue - names: Set[str] = set() - for signode in objnode: - if not isinstance(signode, addnodes.desc_signature): - continue - modname = signode.get('module') - fullname = signode.get('fullname') - try: - if fullname and modname==None: - if fullname.split('.')[-1].lower() == fullname.split('.')[-1] and fullname.split('.')[-2].lower() != 
fullname.split('.')[-2]: - modname = '.'.join(fullname.split('.')[:-2]) - fullname = '.'.join(fullname.split('.')[-2:]) - else: - modname = '.'.join(fullname.split('.')[:-1]) - fullname = fullname.split('.')[-1] - fullname_new = fullname - except Exception: - logger.warning(f'error_modename:{modname}') - logger.warning(f'error_fullname:{fullname}') - refname = modname - if env.config.viewcode_follow_imported_members: - new_modname = app.emit_firstresult( - 'viewcode-follow-imported', modname, fullname, - ) - if not new_modname: - new_modname = _get_full_modname(app, modname, fullname) - modname = new_modname - # logger.warning(f'new_modename:{modname}') - if not modname: - continue - # fullname = signode.get('fullname') - # if fullname and modname==None: - fullname = fullname_new - if not has_tag(modname, fullname, env.docname, refname): - continue - if fullname in names: - # only one link per name, please - continue - names.add(fullname) - pagename = posixpath.join(OUTPUT_DIRNAME, modname.replace('.', '/')) - signode += viewcode_anchor(reftarget=pagename, refid=fullname, refdoc=env.docname) - - -def env_merge_info(app: Sphinx, env: BuildEnvironment, docnames: Iterable[str], - other: BuildEnvironment) -> None: - if not hasattr(other, '_viewcode_modules'): - return - # create a _viewcode_modules dict on the main environment - if not hasattr(env, '_viewcode_modules'): - env._viewcode_modules = {} # type: ignore - # now merge in the information from the subprocess - for modname, entry in other._viewcode_modules.items(): # type: ignore - if modname not in env._viewcode_modules: # type: ignore - env._viewcode_modules[modname] = entry # type: ignore - else: - if env._viewcode_modules[modname]: # type: ignore - used = env._viewcode_modules[modname][2] # type: ignore - for fullname, docname in entry[2].items(): - if fullname not in used: - used[fullname] = docname - - -def env_purge_doc(app: Sphinx, env: BuildEnvironment, docname: str) -> None: - modules = getattr(env, '_viewcode_modules', {}) - - for modname, entry in list(modules.items()): - if entry is False: - continue - - code, tags, used, refname = entry - for fullname in list(used): - if used[fullname] == docname: - used.pop(fullname) - - if len(used) == 0: - modules.pop(modname) - - -class ViewcodeAnchorTransform(SphinxPostTransform): - """Convert or remove viewcode_anchor nodes depends on builder.""" - default_priority = 100 - - def run(self, **kwargs: Any) -> None: - if is_supported_builder(self.app.builder): - self.convert_viewcode_anchors() - else: - self.remove_viewcode_anchors() - - def convert_viewcode_anchors(self) -> None: - for node in self.document.findall(viewcode_anchor): - anchor = nodes.inline('', _('[源代码]'), classes=['viewcode-link']) - refnode = make_refnode(self.app.builder, node['refdoc'], node['reftarget'], - node['refid'], anchor) - node.replace_self(refnode) - - def remove_viewcode_anchors(self) -> None: - for node in list(self.document.findall(viewcode_anchor)): - node.parent.remove(node) - - -def missing_reference(app: Sphinx, env: BuildEnvironment, node: Element, contnode: Node - ) -> Optional[Node]: - # resolve our "viewcode" reference nodes -- they need special treatment - if node['reftype'] == 'viewcode': - warnings.warn('viewcode extension is no longer use pending_xref node. 
' - 'Please update your extension.', RemovedInSphinx50Warning) - return make_refnode(app.builder, node['refdoc'], node['reftarget'], - node['refid'], contnode) - - return None - - -def get_module_filename(app: Sphinx, modname: str) -> Optional[str]: - """Get module filename for *modname*.""" - source_info = app.emit_firstresult('viewcode-find-source', modname) - if source_info: - return None - else: - try: - filename, source = ModuleAnalyzer.get_module_source(modname) - return filename - except Exception: - return None - - -def should_generate_module_page(app: Sphinx, modname: str) -> bool: - """Check generation of module page is needed.""" - module_filename = get_module_filename(app, modname) - if module_filename is None: - # Always (re-)generate module page when module filename is not found. - return True - - builder = cast(StandaloneHTMLBuilder, app.builder) - basename = modname.replace('.', '/') + builder.out_suffix - page_filename = path.join(app.outdir, '_modules/', basename) - - try: - if path.getmtime(module_filename) <= path.getmtime(page_filename): - # generation is not needed if the HTML page is newer than module file. - return False - except IOError: - pass - - return True - - -def collect_pages(app: Sphinx) -> Generator[Tuple[str, Dict[str, Any], str], None, None]: - env = app.builder.env - if not hasattr(env, '_viewcode_modules'): - return - if not is_supported_builder(app.builder): - return - highlighter = app.builder.highlighter # type: ignore - urito = app.builder.get_relative_uri - - modnames = set(env._viewcode_modules) # type: ignore - - for modname, entry in status_iterator( - sorted(env._viewcode_modules.items()), # type: ignore - __('highlighting module code... '), "blue", - len(env._viewcode_modules), # type: ignore - app.verbosity, lambda x: x[0]): - if not entry: - continue - if not should_generate_module_page(app, modname): - continue - - code, tags, used, refname = entry - # construct a page name for the highlighted source - pagename = posixpath.join(OUTPUT_DIRNAME, modname.replace('.', '/')) - # highlight the source using the builder's highlighter - if env.config.highlight_language in ('python3', 'default', 'none'): - lexer = env.config.highlight_language - else: - lexer = 'python' - highlighted = highlighter.highlight_block(code, lexer, linenos=False) - # split the code into lines - lines = highlighted.splitlines() - # split off wrap markup from the first line of the actual code - before, after = lines[0].split('
')
-        lines[0:1] = [before + '
', after]
-        # nothing to do for the last line; it always starts with 
anyway - # now that we have code lines (starting at index 1), insert anchors for - # the collected tags (HACK: this only works if the tag boundaries are - # properly nested!) - maxindex = len(lines) - 1 - for name, docname in used.items(): - type, start, end = tags[name] - backlink = urito(pagename, docname) + '#' + refname + '.' + name - lines[start] = ( - '
%s' % (name, backlink, _('[文档]')) + - lines[start]) - lines[min(end, maxindex)] += '
' - # try to find parents (for submodules) - parents = [] - parent = modname - while '.' in parent: - parent = parent.rsplit('.', 1)[0] - if parent in modnames: - parents.append({ - 'link': urito(pagename, - posixpath.join(OUTPUT_DIRNAME, parent.replace('.', '/'))), - 'title': parent}) - parents.append({'link': urito(pagename, posixpath.join(OUTPUT_DIRNAME, 'index')), - 'title': _('Module code')}) - parents.reverse() - # putting it all together - context = { - 'parents': parents, - 'title': modname, - 'body': (_('

Source code for %s

') % modname + - '\n'.join(lines)), - } - yield (pagename, context, 'page.html') - - if not modnames: - return - - html = ['\n'] - # the stack logic is needed for using nested lists for submodules - stack = [''] - for modname in sorted(modnames): - if modname.startswith(stack[-1]): - stack.append(modname + '.') - html.append('
    ') - else: - stack.pop() - while not modname.startswith(stack[-1]): - stack.pop() - html.append('
') - stack.append(modname + '.') - html.append('
  • %s
  • \n' % ( - urito(posixpath.join(OUTPUT_DIRNAME, 'index'), - posixpath.join(OUTPUT_DIRNAME, modname.replace('.', '/'))), - modname)) - html.append('' * (len(stack) - 1)) - context = { - 'title': _('Overview: module code'), - 'body': (_('

    All modules for which code is available

    ') + - ''.join(html)), - } - - yield (posixpath.join(OUTPUT_DIRNAME, 'index'), context, 'page.html') - - -def setup(app: Sphinx) -> Dict[str, Any]: - app.add_config_value('viewcode_import', None, False) - app.add_config_value('viewcode_enable_epub', False, False) - app.add_config_value('viewcode_follow_imported_members', True, False) - app.connect('doctree-read', doctree_read) - app.connect('env-merge-info', env_merge_info) - app.connect('env-purge-doc', env_purge_doc) - app.connect('html-collect-pages', collect_pages) - app.connect('missing-reference', missing_reference) - # app.add_config_value('viewcode_include_modules', [], 'env') - # app.add_config_value('viewcode_exclude_modules', [], 'env') - app.add_event('viewcode-find-source') - app.add_event('viewcode-follow-imported') - app.add_post_transform(ViewcodeAnchorTransform) - return { - 'version': sphinx.__display_version__, - 'env_version': 1, - 'parallel_read_safe': True - } diff --git a/docs/mindformers/docs/_ext/rename_include.py b/docs/mindformers/docs/_ext/rename_include.py deleted file mode 100644 index bf7dea25f3ee7fd371659e80a3551439fbddee5a..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/_ext/rename_include.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Rename .rst file to .txt file for include directive.""" -import os -import re -import glob -import logging - -logging.basicConfig(level=logging.WARNING, format='%(message)s') -logger = logging.getLogger(__name__) - -origin = "rst" -replace = "txt" - -include_re = re.compile(r'\.\. include::\s+(.*?)(\.rst|\.txt)') -include_re_sub = re.compile(rf'(\.\. include::\s+(.*?))\.{origin}') - -# Specified file_name lists excluded from rename procedure. -whitepaper = ['operations.rst'] - -def repl(matchobj): - """Replace functions for matched.""" - if matchobj.group(2).split('/')[-1] + f'.{origin}' in whitepaper: - return matchobj.group(0) - return rf'{matchobj.group(1)}.{replace}' - -def rename_include(api_dir): - """ - Rename .rst file to .txt file for include directive. - - api_dir - api path relative. 
- """ - tar = [] - for root, _, files in os.walk(api_dir): - for file in files: - if not file.endswith('.rst'): - continue - try: - with open(os.path.join(root, file), 'r+', encoding='utf-8') as f: - content = f.read() - tar_ = include_re.findall(content) - if tar_: - tar_ = [i[0].split('/')[-1]+f'.{origin}' for i in tar_] - tar.extend(tar_) - sub = include_re_sub.findall(content) - if sub: - content_ = include_re_sub.sub(repl, content) - f.seek(0) - f.truncate() - f.write(content_) - except UnicodeDecodeError: - # pylint: disable=logging-fstring-interpolation - logger.warning(f"UnicodeDecodeError for: {file}") - - all_rst = glob.glob(f'{api_dir}/**/*.{origin}', recursive=True) - - for i in all_rst: - if os.path.dirname(i).endswith("api_python") or os.path.basename(i) in whitepaper: - continue - name = os.path.basename(i) - if name in tar: - os.rename(i, i.replace(f'.{origin}', f'.{replace}')) diff --git a/docs/mindformers/docs/requirements.txt b/docs/mindformers/docs/requirements.txt deleted file mode 100644 index 46904323e583b9e0318a9b7a0a7daa23b5e2b3e5..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -sphinx == 4.4.0 -docutils == 0.17.1 -myst-parser == 0.18.1 -sphinx_rtd_theme == 1.0.0 -numpy -nbsphinx == 0.8.11 -IPython -jieba diff --git a/docs/mindformers/docs/source_en/_templates/classtemplate.rst b/docs/mindformers/docs/source_en/_templates/classtemplate.rst deleted file mode 100644 index 124a40fc2193248cd223e7240c80201b824a4ac5..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/_templates/classtemplate.rst +++ /dev/null @@ -1,253 +0,0 @@ -.. role:: hidden - :class: hidden-section - -.. currentmodule:: {{ module }} - -{% if fullname=="mindformers.AutoConfig" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: get_config_origin_mode, get_support_list, invalid_yaml_name - :members: - -{% elif fullname=="mindformers.modules.OpParallelConfig" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: construct, get_ulysses_cp_num, to_dict, to_diff_dict - :members: - -{% elif fullname=="mindformers.AutoProcessor" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: from_pretrained_origin, get_support_list, invalid_yaml_name, show_support_list - :members: - -{% elif fullname=="mindformers.AutoTokenizer" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: get_class_from_origin_mode, get_support_list, invalid_yaml_name, show_support_list - :members: - -{% elif fullname=="mindformers.core.AdamW" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: clone_state, construct - :members: - -{% elif fullname=="mindformers.core.Came" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: init_came_state, supports_flat_params, supports_memory_efficient_fp16, target, construct - :members: - -{% elif fullname=="mindformers.core.CheckpointMonitor" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: record_last_ckpt_to_json, save_checkpoint, save_checkpoint_network, print_savetime, remove_redundancy - :members: - -{% elif fullname=="mindformers.core.EmF1Metric" %} -{{ fullname | underline }} - -.. 
autoclass:: {{ name }} - :exclude-members: calc_em_score, calc_f1_score, evaluate_pairs, find_lcs, mixed_segmentation, remove_punctuation - :members: - -{% elif fullname=="mindformers.core.EntityScore" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: compute, get_entities_bios - :members: - -{% elif fullname=="mindformers.core.MFLossMonitor" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: dump_info_to_modelarts, epoch_begin, epoch_end, print_output_info, step_begin, step_end - :members: - -{% elif fullname=="mindformers.core.ProfileMonitor" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: step_begin, step_end - :members: - -{% elif fullname=="mindformers.core.PromptAccMetric" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: calculate_circle - :members: - -{% elif fullname=="mindformers.core.SQuADMetric" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: clear, eval, update - :members: - -{% elif fullname=="mindformers.generation.GenerationConfig" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: from_dict, from_model_config, to_dict, update - :members: - -{% elif fullname=="mindformers.generation.GenerationMixin" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: add_flags_custom, get_logits_processor, get_logits_warper, prepare_inputs_for_generation, process_logits, slice_incremental_inputs, update_model_kwargs_before_generate, chunk_prefill_infer, prepare_inputs_for_generation_mcore, forward_mcore, infer_mcore, add_flags_custom_mcore - :members: - -{% elif fullname=="mindformers.models.ChatGLM2ForConditionalGeneration" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: add_flags_custom, prepare_inputs_for_generation, prepare_inputs_for_predict_layout, construct - :members: - -{% elif fullname=="mindformers.models.ChatGLM3Tokenizer" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: build_batch_input, build_chat_input, build_inputs_with_special_tokens, convert_tokens_to_ids, get_vocab, save_vocabulary, tokenize - :members: - -{% elif fullname=="mindformers.models.ChatGLM4Tokenizer" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: build_batch_input, build_chat_input, build_inputs_with_special_tokens, build_single_message, convert_special_tokens_to_ids, convert_tokens_to_string, get_vocab, save_vocabulary - :members: - -{% elif fullname=="mindformers.models.LlamaForCausalLM" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: add_flags_custom, prepare_inputs_for_predict_layout, to_embeddings, construct, prepare_inputs_for_prefill_flatten, convert_map_dict, convert_weight_dict, convert_name, pre_gather_func - :members: - -{% elif fullname=="mindformers.models.LlamaTokenizer" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: convert_tokens_to_string, get_spm_processor, get_vocab, save_vocabulary, tokenize, vocab_size - :members: - -{% elif fullname=="mindformers.models.multi_modal.ModalContentTransformTemplate" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: batch_input_ids, check_modal_builder_tokens, generate_modal_context_positions, stack_data, try_to_batch - :members: - -{% elif fullname=="mindformers.models.PretrainedConfig" %} -{{ fullname | underline }} - -.. 
autoclass:: {{ name }} - :exclude-members: dict_ms_dtype_to_str, get_config_origin_mode, get_support_list, inverse_parse_config, register_for_auto_class, remove_type, save_config_origin_mode, show_support_list, delete_from_dict - :members: - -{% elif fullname=="mindformers.models.PreTrainedModel" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: base_model, framework, from_pretrained_experimental_mode, from_pretrained_origin_mode, fuse_weight_from_ckpt, get_support_list, is_experimental_mode, load_checkpoint, prepare_inputs_for_predict_layout, remove_type, save_pretrained_experimental_mode, save_pretrained_origin_mode, set_dynamic_inputs, show_support_list, convert_map_dict, convert_weight_dict, convert_name, obtain_qkv_ffn_concat_keys, obtain_name_map - :members: - -{% elif fullname=="mindformers.models.PreTrainedTokenizer" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: get_special_tokens_mask, tokenize_atom, vocab_size - :members: - -{% elif fullname=="mindformers.models.PreTrainedTokenizerFast" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: backend_tokenizer, can_save_slow_tokenizer, decoder, init_atom_1, init_atom_2, save_vocabulary, vocab_size - :members: - -{% elif fullname=="mindformers.pet.models.LoraModel" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: add_adapter - :members: - -{% elif fullname=="mindformers.pipeline.MultiModalToTextPipeline" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: preprocess - :members: - -{% elif fullname=="mindformers.tools.MindFormerConfig" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: set_value, get_value - :members: - -{% elif fullname=="mindformers.tools.register.MindFormerRegister" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: auto_register - :members: - -{% elif fullname=="mindformers.Trainer" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: add_callback, get_eval_dataloader, get_last_checkpoint, get_load_checkpoint, get_task_config, get_train_dataloader, init_openmind_repo, pop_callback, push_to_hub, remove_callback, save_model, set_parallel_config, set_recompute_config - :members: - -{% elif fullname=="mindformers.TrainingArguments" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: eval_batch_size, get_device_id, get_device_num, get_rank_id, local_process_index, process_index, set_evaluate, set_push_to_hub, set_testing, to_dict, to_json_string, train_batch_size, world_size - :members: - -{% elif fullname=="mindformers.core.TrainingStateMonitor" %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: epoch_begin, epoch_end, step_begin, step_end - :members: - -{% elif fullname in ["mindformers.AutoModelForCausalLM", "mindformers.AutoModelForZeroShotImageClassification", "mindformers.AutoModel"] %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: construct - :members: register, from_config, from_pretrained - -{% elif objname[0].istitle() %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: construct - :members: - -{% else %} -{{ fullname | underline }} - -.. autofunction:: {{ fullname }} - -{% endif %} - -.. 
- autogenerated from _templates/classtemplate.rst - note it does not have :inherited-members: diff --git a/docs/mindformers/docs/source_en/acc_optimize/acc_optimize.md b/docs/mindformers/docs/source_en/acc_optimize/acc_optimize.md deleted file mode 100644 index 2a252cb01120ecbebc50a3eb532e6f025e6366a0..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/acc_optimize/acc_optimize.md +++ /dev/null @@ -1,492 +0,0 @@ -# Large Model Accuracy Optimization Guide - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/acc_optimize/acc_optimize.md) - -## Overview and Scenarios of Accuracy Issues - -### Descriptions - -As the Ascend AI processor (hereinafter referred to as NPU) is widely used in deep learning, the MindSpore framework, which is developed natively based on the Ascend NPU, shows better performance advantages. During large-scale cluster training, the performance improvement will greatly save users the cost of large model development. Therefore, more and more users are gradually migrating their original training models to MindSpore. However, due to the differences in hardware and framework usage, users may encounter accuracy problems after completing the model migration. - -This paper summarizes the common accuracy problems in the training process of large models and general accuracy problem localization methods, and seeks to help users quickly troubleshoot accuracy problems and shorten the time for model accuracy problem localization. When starting the work on large model accuracy optimization, you should have the basic knowledge of large model. To avoid dispersion, this document will not explain the basic concepts related to large models and focus on the introduction of accuracy optimization. - -### Categorized Summary of Common Problems - -Various accuracy problems often occur in large model training, and the common problems include that the loss fails to converge, the loss converges poorly, the loss fails to converge at the late stage of training, the accuracy overflows, and the loss can not be fitted to the benchmark in the process of descending. There can be a variety of reasons for these accuracy problems, including the structure of the model, the dataset, the hyperparameters, the precision of the forward and reverse computation, the calculation of the optimizer, the floating-point computational accuracy, and randomness. - -When accuracy problems occur, the problem can be analyzed from the reasons for these accuracy problems. A quick troubleshooting based on CheckList is performed first, followed by parameter and weight alignment, fixed randomness and turning on deterministic calculations. Then the base problem is troubleshooted, and finally the anomalous step is troubleshooted by long stable training. At the current stage, this paper mainly introduces the general method of accuracy localization for the scenarios with accuracy benchmarks, and the content of accuracy problem localization without accuracy benchmarks will be added successively. - -## Accuracy Problems Location CheckList - -Before locating the operator accuracy problem, we should first eliminate the interference of other non-operator factors. Combined with the previous precision positioning cases, the CheckList before precision positioning is summarized. 
To make problems easier to locate, users can first run a quick check against the CheckList. - -### Network Structure CheckList - -#### Generalized structure - -| **Key parameters** | **Descriptions** | **CheckList** | -| ----------------- | ------------------------- |---------------------------------| -| num_layers | Number of transformer layers | Corresponds to the Megatron num-layers parameter; check for consistency. | -| num_heads | Number of attention heads in transformer | Corresponds to the Megatron num-attention-heads parameter; check for consistency. | -| hidden_size | Transformer hidden layer size | Corresponds to the Megatron hidden-size parameter; check for consistency. | -| intermediate_size | Feed-Forward Network hidden layer size | Corresponds to the Megatron ffn-hidden-size parameter; check for consistency. | -| n_kv_heads | Number of kv groups | Corresponds to the Megatron num-query-groups parameter; check for consistency. | -| Normalization function | Normalization function; common choices are LayerNorm and RMSNorm | MindSpore Transformers uses a fixed normalization function that cannot be modified by configuration. In Megatron it can be customized via the normalization option; check for consistency. | -| rms_norm_eps | Epsilon parameter of the normalization | Corresponds to the Megatron layernorm_epsilon parameter; check for consistency. | -| dropout | dropout in the network | Currently, when MindSpore enables dropout, recomputation cannot be enabled; if precision comparison is carried out, it is recommended that dropout be disabled on both sides to reduce random factors.| -| Fusion computation | Common fusion operators include FA, ROPE, Norm, SwigLU; some users will fuse Wq, Wk, Wv for computation | 1. For accuracy comparison on the same hardware, if fusion operators are used, they should be consistent.
    2. When comparing accuracy on different hardware, focus on checking whether there is any difference in the calculation of the fusion calculation part. | - -#### MOE Structure - -| **Key parameters**                                   | **Descriptions** | **CheckList** | -| ----------------- | ------------------------------------------------------------ |------------------------------------------------------------------------------------------------------------------------------------| -| expert_num | Number of experts | Correspond to the Megatron num-experts parameter and check for consistency. | -| num_experts_chosen | Number of experts selected per token | Correspond to the Megatron moe-router-topk parameter and check for consistency. | -| capacity_factor | Expert capacity factor | Correspond to the Megatron moe_expert_capacity_factor parameter and check for consistency. | -| aux_loss_factor | Load balancing loss contribution factor | When turned on, it is recommended to be less than 0.05. If precision alignment is performed, it is not recommended to be turned on, and is inconsistent with Megatron loss printing method. | -| enable_sdrop | Whether to enable the sdrop (drop implementation) method | It is recommended to set it to true; the corresponding Megatron needs to set the following parameters:
    `moe-token-drop-policy: position`
    `moe-pad-expert-input-to-capacity: True` | -| router_dense_type | Decide the expert sense layer | Configurable in MindSpore Transformers, FP32 calculations are recommended to prevent overflow; not configurable in Megatron. | -| use_fused_ops_topkrouter | Whether to use the fusion operator for dispatch as well as combine indexing calculations | Fusion operator in MindSpore Transformers takes effect when `enable_sdrop=True`, precision alignment is recommended to be set to True. | -| use_shared_expert_gating | Whether the gating factor is used in the shared expert network | Check if the network sharing expert has a gating factor, if so set it to True. | - -### Optimizer CheckList - -| **Key parameters** | **Descriptions** | **CheckList** | -| ----------------- | ------------------------------------------------------------ |------------------------------------------------------------------------------------------------------------------------------------| -| adam optimizer | optimizer type | If Megatron uses the adam optimizer, the mathematically equivalent implementation of MindSpore Transformers is AdamW. | -| eps | adam optimizer minimal value parameter | Check the parameters for consistency, recommended value is 1e-8. | -| beta1 | adam optimizer gradient momentum parameters | Check the parameters for consistency, recommended value is 0.9. | -| beta2 | adam optimizer gradient variance parameter | Check the parameters for consistency, recommended value is 0.95. | -| weight_decay | weight decay | By default bias and one-dimensional weights are not decayed and the user is checked for special operations. | -| lr | learning rate | After setting up warmup, learning rate decay, draw a graph to see if the learning rate change is consistent. | -| lr_warmup_fraction | Learning rate warmup step percentage | After setting up warmup, learning rate decay, draw a graph to see if the learning rate change is consistent. | -| clip_grad | clipping gradient | Check the parameters for consistency, recommended value is 1.0. | -| global_batch_size | Global batch size | Consistency with the benchmark can be checked by printing a log during training. | - -### Weight CheckList - -| **Key parameters** | **Descriptions** | **CheckList** | -| ----------------- | ------------------------------------------------------------ |------------------------------------------------------------------------------------------------------------------------------------| -| param_init_type | Weight initialization type | MindSpore Transformers usually sets the param_init_dtype type to FP32. This is because the gradient communication type needs to be the same as the weight type, controlling the communication type to be FP32. Megatron gradient communication type defaults to FP32 and is not tied to the weight type. | -| init-method-std | Distribution of weights randomly initialized | If weighted random initialization is used, parameters such as mean/std in the random distribution need to be checked for consistency. | - -### Mixed-precision CheckList - -| **Key parameters** | **Descriptions** | **CheckList** | -| ----------------- | ----------------------------------------- |---------------------------------------| -| compute_dtype | Compute accuracy | Megatron set `-bf16: true` to BF16, otherwise FP16. | -| layernorm_compute_type | LayerNorm/RMSNorm compute precision | Megatron is not configurable, need to check that implementations are consistent. 
| -| softmax_compute_type | When MindSpore uses FA, the internal Softmax is computed inside FA with a fixed precision; the computation type is configurable only for the small-operator (non-fused) implementation | Megatron is not configurable, needs to check if the implementation is consistent. | -| rotary_dtype | Calculation accuracy of rotary position encoding | Megatron is not configurable, needs to check if the implementation is consistent. | -| Calculation of weights | Computation precision of each weight, such as Embedding and lm_head | Since MindSpore Transformers weight initialization needs to be set to FP32, and the usual calculation precision is BF16/FP16, it is necessary to check whether the weight data type is converted to BF16/FP16 before weight calculation.| -| bias add | bias in the linear layer | If bias is present, check that the computation precision of the add in the Linear layer is consistent. | -| residual add | sum of residuals | Check that the computation precision of the residual addition is consistent with the benchmark. | -| loss | Loss calculation module | Check that the computation precision of the entire loss module is consistent with the benchmark. | -| Operator High Precision Mode | Ascend operators support a high-precision mode | Method: `context.set_context(ascend_config= {"ge_options":{ "global":{ "ge.opSelectImplmode":"high_precision" } } })` | - -### Parallel Strategy CheckList - -| **Key parameters** | **Descriptions** | **CheckList** | -| ----------------- | ------------------------------------------------------------ |------------------------------------------------------------------------------------------------------------------------------------| -| data_parallel | data parallel | Parallel slicing affects the communication behavior, and the calculations that introduce communication after slicing may be slightly different from the single-card calculations. | -| model_parallel | model parallel | Parallel slicing affects the communication behavior, and the calculations that introduce communication after slicing may be slightly different from the single-card calculations. | -| pipeline_stage | pipeline parallel | Parallel slicing affects the communication behavior, and the calculations that introduce communication after slicing may be slightly different from the single-card calculations. | -| use_seq_parallel | Corresponding to Megatron short sequence parallelism | Parallel slicing affects the communication behavior, and the calculations that introduce communication after slicing may be slightly different from the single-card calculations. | -| enable_parallel_optimizer | optimizer parallel | For optimizer parallelism, MindSpore and PyTorch have different implementation schemes and inconsistent communication behavior. It is recommended to turn it off when performing precision alignment. | -| micro_batch_interleave_num | multi-copy parallel | For multi-copy parallelism, MindSpore and PyTorch have different implementation schemes and inconsistent communication behavior. It is recommended to turn it off when performing precision alignment. | - -### Other CheckList - -| **Key parameters** | **CheckList** | -| ----------------- | ---------------------------| -| Data Check | Check whether the data is abnormal; you can randomly select part of the data and decode/encode it to check whether the input and label positions correspond correctly. | -| Special Words Check | Check whether special ids such as bos_token_id, eos_token_id and pad_token_id are consistent with the ids used when the data was produced. 
| -| inputs_id check | Check whether inputs_id in Embedding is consistent with 0<=inputs_id 0, the weights are updated, and the long stability test is performed. The training to a certain step appeared the phenomenon of large differences in the loss, after which the training loss began to diverge, as shown in Fig: - -![loss1](./image/loss1.png) - -In this scenario, the training before and after the mutation can be targeted for troubleshooting, and the following troubleshooting can be tried: - -* Check the data situation near the loss mutation to troubleshoot if there is any abnormal data. Decode the data to text via tokenizer to see if the data is abnormal; at the same time, you can try to skip this batch of data for training to verify whether it is caused by the data. - -* Check if there is precision overflow in the vicinity of the mutation. - -* You can check whether there is any abnormality in the local norm, check the training data of the Dump mutation step, troubleshoot the calculated mutation points, and analyze whether the operator outputs abnormally. - -#### Loss Varies Greatly in the Later Stages - -It is also possible to have a better fit in the early part of the training period and a large difference in the convergence loss in the later part of the training period in the long stability test, as shown in Fig: - -![loss2](./image/loss2.png) - -In this scenario, troubleshooting can be done from the following perspectives: - -* Examine whether the parameters are aligned: focus on examining the parameters related to the optimizer, such as the optimizer type, learning rate, weight decay. We can compare whether the change of learning rate during training is consistent by drawing diagrams, and we also need to confirm whether the weight of weight decay is consistent with the benchmark. - -* Mixed accuracy checking: through the Dump tool, carefully check whether the mixed accuracy is consistent with the benchmark in the calculation process; - -* If there is a difference in the loss at convergence, but the difference is small, such as less than 1%, the accuracy acceptance can be performed by evaluating the downstream tasks. - -#### Scenario Expansion - -After completing the single-card alignment, gradually expand from single-card to multi-card testing and cluster testing; model size and related features such as model parallelism, flow parallelism, optimizer parallelism are added as appropriate. Gradually expand from simple scenarios to actual training scenarios, so as to troubleshoot the impact of the added features on the accuracy. - -### Large Model Migration Accuracy Standard - -Accuracy standard for large model migration refers to the accuracy standard set for key indicators to ensure that the model accuracy before and after migration is basically the same after migrating the models trained by other third-party hardware or frameworks to MindSpore and Ascend Hardware. It is summarized based on the actual migration scenarios of MindSpore's large models for developers' reference. Since the accuracy of large models is strongly related to the application domain, model structure, number of parameters, and hyperparameters, and is not fully interpretable, there is no complete and unified mandatory standard. Therefore, this standard is only used as a reference standard to help users make a basic judgment on the accuracy of model migration. - -#### Accuracy Standard Specifications - -1. 
Relative discrepancy is uniformly described as a percentage (x.x%) and absolute discrepancy is uniformly described as a decimal (0.xx); -2. If the accuracy fluctuations of the third-party model training no longer meet this accuracy standard, the original model should be adequately tested and the standard should be relaxed in accordance with the fluctuations of the original model; - -#### Default Configuration - -| Classes | Default Values | Descriptions | -|--------------------|------|-------------------------------| -| Dataset | [pretrain] wikitext-103<br>[sft] alpaca | | -| Accuracy mode | BF16 | Mixed-accuracy configurations are consistent, and distinguish between actual FP32/FP16/BF16 configurations for each API in the network. | -| Parallel method | Data parallel | The parallelism can be adjusted according to the computational resources. | -| Cluster size | Stand-alone 8 cards | Can be adjusted according to the computational resources. | -| checkpoint | [pretrain] Script initialization by default
    [sft]Loading pre-training weights | ckpt has a large impact on the accuracy metrics, prioritizing weights with small fluctuations in loss and a clear downward trend in overall loss.| -|determinism|Turn on|The accuracy indicator determination phase can turn off determinism. The comparison phase needs to turn on determinism in order to minimize random error interference.| - -#### Accuracy Standard Indicator - -* Test Standard - - 1. Without user's special designation, the default continuous observation is 5000 steps or 12 hours, the number of steps can be reduced according to the resource situation, but it is not recommended to be less than 1000 steps. - 2. Load the same weights, keep all hyperparameters configured the same, and turn off all randomness. - 3. The fluctuation of indicators such as loss is greatly influenced by the model, weights, and hyperparameters, and the combination with smooth loss fluctuation is preferred as a benchmark to reduce the judgment of random fluctuation on the accuracy results. - 4. The randomness of the third-party model was adequately tested by repeating the experiment at least 2 times with determinism turned off and observing the range of fluctuations in the accuracy metrics. - -* loss Accuracy Standard - - 1. The absolute error of first loss is less than 0.005, or the relative error is less than 0.5%. - 2. The average absolute error is less than 0.01, or the average relative error is less than 1%. - -* Monitoring Indicators - - The average relative error of the global norm does not exceed 10%. - -### Case Details - -This section will introduce the completion of accuracy ranking based on the above accuracy localization process with practical examples. - -#### Problem Phenomenon - -Training the model with a 128-card cluster and comparing training with Ascend+MindSpore training with GPU+PyTorch training reveals that the late training convergence loss is about 0.1 higher than GPU+PyTorch. As shown in the figure, the convergence is not as expected: - -![loss3](./image/loss3.png) - -The red line is the Ascend+MindSpore training curve and the blue line is the GPU+PyTorch training curve. - -#### Problem Location Process - -Before locating the problem, check against the CheckList to confirm that there is no error and then start locating the problem. - -First the loss alignment of step1 is confirmed to be OK. Comparing the local norm of step1 and calculating the difference between the local norm value of each weight and the benchmark, it is found that the local norm value of Embedding weight has a large difference with the benchmark. - -![local norm](./image/local_norm.png) - -The reason for this is that MindSpore Transformers uses FP32 for weight initialization, and FP32 precision is used for both forward and backward Embedding calculations, while PyTorch forward and backward calculations are BF16, which leads to differences in the calculated local norm values. - -Once the computational accuracy is aligned, the exhaustive optimizer computation is also fine, and the long stable training alignment starts. - -The long stable training exhaustion will be extended from single card experiments to multi-card experiments by first setting the LEARNING RATE=0, i.e., the weights are not updated. Forward computation of the loss difference of each step is around 0.001, and the forward computation error is as expected. The difference of global norm of each step is about 0.05, and the difference of reverse calculation is not significant. 
It was initially judged that the migrated model code is correct, the model structure is consistent, and the forward and backward computations show no significant difference. - -![loss4](./image/loss4.png) - -Weight updates were then re-enabled for single-card training with learning rate=1e-5 for 1k steps. The loss showed a steady difference of about 0.1 in the late stage of convergence, reproducing the problem. - -![loss5](./image/loss5.png) - -Troubleshooting this run identified the following problems: - -* The Dump files revealed inconsistencies in computation precision during training, which were then unified. - -* The weight decay implementation was inconsistent: the user's PyTorch network applies weight decay to all weights, whereas MindSpore Transformers does not apply weight decay to bias weights and one-dimensional weights by default. - -After fixing these problems, the experiment was repeated for 10,000 steps. The loss difference fluctuated around the 0 axis and stayed below 0.03, the accuracy met expectations, and single-card accuracy was aligned. - -After completing the single-card training, multi-card training was tested with learning rate=1e-5 for 1,000 steps. Convergence was consistent in the late stage of training, but there was a stable error of about 0.05 in the middle stage. - -![loss6](./image/loss6.png) - -To verify that this error is within reasonable limits, deterministic computation was turned off and the GPU experiment was run twice. The red line in the figure is the MindSpore training curve, and the blue and green lines are the curves of the first and second GPU runs, respectively. Around the training instability at about 7,000 steps, the MindSpore curve lies right between the two GPU curves, indicating that the error is within a reasonable range; the problem is thus resolved. 
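In practice, the loss comparisons used throughout this process reduce to computing absolute and relative errors between two training curves and checking them against the loss accuracy standard given above. The following is a minimal sketch of such a check, assuming the per-step losses of both runs have been exported as arrays; the file names are illustrative and should be adapted to the actual training logs.

```python
# Minimal sketch: compare two per-step loss curves against the loss accuracy
# standard above. File and array names are illustrative only.
import numpy as np

def check_loss_alignment(loss_npu, loss_benchmark):
    loss_npu = np.asarray(loss_npu, dtype=np.float64)
    loss_benchmark = np.asarray(loss_benchmark, dtype=np.float64)
    abs_err = np.abs(loss_npu - loss_benchmark)
    rel_err = abs_err / np.maximum(np.abs(loss_benchmark), 1e-12)
    first_ok = abs_err[0] < 0.005 or rel_err[0] < 0.005       # first loss: abs < 0.005 or rel < 0.5%
    mean_ok = abs_err.mean() < 0.01 or rel_err.mean() < 0.01  # average: abs < 0.01 or rel < 1%
    return first_ok and mean_ok

# Example usage with losses dumped to text files (one value per step):
# aligned = check_loss_alignment(np.loadtxt("loss_ms.txt"), np.loadtxt("loss_gpu.txt"))
```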
- -![loss7](./image/loss7.png) diff --git a/docs/mindformers/docs/source_en/acc_optimize/image/general_process.png b/docs/mindformers/docs/source_en/acc_optimize/image/general_process.png deleted file mode 100644 index ee26be669b8fe9b41baf0328cc8bf2acf347dd65..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/acc_optimize/image/general_process.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/acc_optimize/image/local_norm.png b/docs/mindformers/docs/source_en/acc_optimize/image/local_norm.png deleted file mode 100644 index c648c187c6be5da9dc29c360f5c527fb0d40b644..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/acc_optimize/image/local_norm.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/acc_optimize/image/loss1.png b/docs/mindformers/docs/source_en/acc_optimize/image/loss1.png deleted file mode 100644 index c665b20eaf5ff0b40f0da7c6dd7724cc219e9491..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/acc_optimize/image/loss1.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/acc_optimize/image/loss2.png b/docs/mindformers/docs/source_en/acc_optimize/image/loss2.png deleted file mode 100644 index fef240e4e62ddb3b342877efd0c0c6e908462dff..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/acc_optimize/image/loss2.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/acc_optimize/image/loss3.png b/docs/mindformers/docs/source_en/acc_optimize/image/loss3.png deleted file mode 100644 index 15cfd9315ec6ad44caf532e0901d71fb8dfc3c80..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/acc_optimize/image/loss3.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/acc_optimize/image/loss4.png b/docs/mindformers/docs/source_en/acc_optimize/image/loss4.png deleted file mode 100644 index 130916fcfa1b42dcc3f49cc4833fa6cf449d40da..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/acc_optimize/image/loss4.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/acc_optimize/image/loss5.png b/docs/mindformers/docs/source_en/acc_optimize/image/loss5.png deleted file mode 100644 index aeac937ce8ef54e462ee81de5b1e5eaf7178a768..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/acc_optimize/image/loss5.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/acc_optimize/image/loss6.png b/docs/mindformers/docs/source_en/acc_optimize/image/loss6.png deleted file mode 100644 index c4061f5c18e886d1036001c0d509e0a3974b8684..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/acc_optimize/image/loss6.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/acc_optimize/image/loss7.png b/docs/mindformers/docs/source_en/acc_optimize/image/loss7.png deleted file mode 100644 index 58ecc6e3ee9da518b9b77be06df7c825e0ddb6fa..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/acc_optimize/image/loss7.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/appendix/conf_files.md b/docs/mindformers/docs/source_en/appendix/conf_files.md deleted file mode 100644 index fb0f578dd722f82e6b2f1f40417408950d1f6024..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/appendix/conf_files.md +++ /dev/null @@ -1,329 +0,0 @@ -# 
Configuration File Descriptions - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/appendix/conf_files.md) - -## Overview - -Different parameters usually need to be configured during the training and inference process of a model. MindSpore Transformers supports the use of `YAML` files to centrally manage and adjust the configurable items, which makes the configuration of the model more structured and improves its maintainability at the same time. - -## Description of the YAML File Contents - -The `YAML` file provided by MindSpore Transformers contains configuration items for different functions, which are described below according to their contents. - -### Basic Configuration - -The basic configuration is mainly used to specify MindSpore random seeds and related settings for loading weights. - -| Parameters | Descriptions | Types | -|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| -| seed | Set the global seed. For details, refer to [mindspore.set_seed](https://www.mindspore.cn/docs/en/r2.6.0/api_python/mindspore/mindspore.set_seed.html). | int | -| run_mode | Set the running mode of the model: `train`, `finetune`, `eval` or `predict`. | str | -| output_dir | Set the path where log, checkpoint, strategy, etc. files are saved. | str | -| load_checkpoint | File or folder paths for loading weights. Currently there are 3 application scenarios
    1. Supports passing in the path of a complete weight file.<br>    2. Supports passing in the path of a folder of offline-sliced weights.<br>    3. Supports passing in the path of a folder containing both LoRA weights and base weights.<br>
    Refer to [Weight Conversion Function](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/weight_conversion.html) for the ways of obtaining various weights. | str | -| auto_trans_ckpt | Enable online weight automatic conversion. Refer to [Weight Conversion Function](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/weight_conversion.html). | bool | -| resume_training | Enable resumable training after breakpoint. For details, refer to [Resumable Training After Breakpoint](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/resume_training.html#resumable-training). | bool | -| load_ckpt_format| The format of loading checkpoint, either `ckpt` or `safetensors`. | str | -| remove_redundancy | Whether the checkpoint has removed redundancy while loading checkpoint. The default value is `False`. | bool | -| train_precision_sync | Switching on or off deterministic computation of the training process. The default value is `None`. | Optional[bool] | -| infer_precision_sync | Switching on or off deterministic computation of the inference process. The default value is `None`. | Optional[bool] | - -### Context Configuration - -Context configuration is mainly used to specify the [mindspore.set_context](https://www.mindspore.cn/docs/en/r2.6.0/api_python/mindspore/mindspore.set_context.html) in the related parameters. - -| Parameters | Descriptions | Types | -|-----------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|----------| -| context.mode | Set the backend execution mode, `0` means GRAPH_MODE. MindSpore Transformers currently only supports running in GRAPH_MODE mode. | int | -| context.device_target | Set the backend execution device. MindSpore Transformers is only supported on `Ascend` devices. | str | -| context.device_id | Set the execution device ID. The value must be within the range of available devices, and the default value is `0`. | int | -| context.enable_graph_kernel | Enable graph fusion to optimize network execution performance, defaults to `False`. | bool | -| context.max_call_depth | Set the maximum depth of a function call. The value must be a positive integer, and the default value is `1000`. | int | -| context.max_device_memory | Set the maximum memory available to the device in the format “xxGB”, and the default value is `1024GB`. | str | -| context.mempool_block_size | Set the size of the memory pool block for devices. The format is "xxGB". Default value is `"1GB"`. | str | -| context.save_graphs | Save the compilation graph during execution.
    1. `False` or `0` indicates that the intermediate compilation graph is not saved.<br>    2. `1` means outputting some of the intermediate files generated during graph compilation.<br>
    3. `True` or `2` indicates the generation of more backend-process-related IR files.
    4. `3` indicates the generation of visualized computational diagrams and more detailed front-end IR diagrams. | bool/int | -| context.save_graphs_path | Path for saving the compilation diagram. | str | -| context.affinity_cpu_list | Optional configuration option, used to implement user-defined binding policies. Enable default binding policy when not configured. `None` means to disable the binding function. Default value is `{}`. If you want to enable custom binding policies, you need to pass in' dict '. See [mindspore.runtime.set_cpu_affinity](https://www.mindspore.cn/docs/en/r2.6.0/api_python/runtime/mindspore.runtime.set_cpu_affinity.html) for details. | dict/str | - -### Model Configuration - -Since the configuration will vary from model to model, only the generic configuration of models in MindSpore Transformers is described here. - -| Parameters | Descriptions | Types | -|--------------------------------------------|--------------------------------------------------------------------------------------------------|------| -| model.arch.type | Set the model class to instantiate the model according to the model class when constructing the model. | str | -| model.model_config.type | Set the model configuration class, the model configuration class needs to match the model class to be used, i.e. the model configuration class should contain all the parameters used by the model class. | str | -| model.model_config.num_layers | Set the number of model layers, usually the number of layers in the model Decoder Layer. | int | -| model.model_config.seq_length | Set the model sequence length, this parameter indicates the maximum sequence length supported by the model. | int | -| model.model_config.hidden_size | Set the dimension of the model hidden state. | int | -| model.model_config.vocab_size | Set the model word list size. | int | -| model.model_config.top_k | Sample from the `top_k` tokens with the highest probability during inference. | int | -| model.model_config.top_p | Sample from tokens that have the highest probability and whose probability accumulation does not exceed `top_p` during inference. | int | -| model.model_config.use_past | Turn on model incremental inference, when turned on you can use Paged Attention to improve inference performance, must be set to `False` during model training. | bool | -| model.model_config.max_decode_length | Set the maximum length of the generated text, including the input length. | int | -| model.model_config.max_length | The descriptions are same as `max_decode_length`. When set together with `max_decode_length`, `max_length` takes effect. | int | -| model.model_config.max_new_tokens | Set the maximum length of the generated new text, excluding the input length, when set together with `max_length`, `max_new_tokens` takes effect. | int | -| model.model_config.min_length | Set the minimum length of the generated text, including the input length. | int | -| model.model_config.min_new_tokens | Set the minimum length of the new text to be generated, excluding the input length; when set together with `min_length`, `min_new_tokens` takes effect. | int | -| model.model_config.repetition_penalty | Set the penalty factor for generating duplicate text, `repetition_penalty` is not less than 1. When it equals to 1, duplicate outputs will not be penalized. | int | -| model.model_config.block_size | Set the size of the block in Paged Attention, only works if `use_past=True`. 
| int | -| model.model_config.num_blocks | Set the total number of blocks in Paged Attention, effective only if `use_past=True`. `batch_size×seq_length<=block_size×num_blocks` should be satisfied. | int | -| model.model_config.return_dict_in_generate | Set to return the inference results of the `generate` interface as a dictionary, defaults to `False`. | bool | -| model.model_config.output_scores | Set to include score before the input softmax for each forward generation when returning the result as a dictionary, defaults to `False`. | bool | -| model.model_config.output_logits | Set to include the logits output by the model at each forward generation when returning results as a dictionary, defaults to `False`. | bool | -| model.model_config.layers_per_stage | Set the number of transformer layers assigned to each stage when enabling the pipeline stage, default is `None`, which means the transformer layers are evenly distributed across each stage. The set value is a list of integers with a length equal to the number of pipeline stages, where the i-th element indicates the number of transformer layers assigned to the i-th stage. | list | - -### MoE Configuration - -In addition to the basic configuration of the model above, the MoE model needs to be configured separately with some superparameters of the moe module, and since the parameters used will vary from model to model, only the generic configuration will be explained: - -| Parameters | Descriptions | Types | -|--------------------------------------------|--------------------------------------------------------------------------------------------------|------| -| moe_config.expert_num | Set the number of routing experts. | int | -| moe_config.shared_expert_num | Set the number of sharing experts. | int | -| moe_config.moe_intermediate_size | Set the size of the intermediate dimension of the expert layer. | int | -| moe_config.capacity_factor | Set the expert capacity factor. | int | -| moe_config.num_experts_chosen | Set the number of experts to select per token. | int | -| moe_config.enable_sdrop | Set whether to enable token drop policy `sdrop`, since MindSpore Transformers's MoE is a static shape implementation so it can't retain all tokens. | bool | -| moe_config.aux_loss_factor | Set the weights of the equilibrium loss. | list[float] | -| moe_config.first_k_dense_replace | Set the enable block of the moe layer, generally set to 1 to indicate that moe is not enabled in the first block. | int | -| moe_config.balance_via_topk_bias | Set whether to enable `aux_loss_free` load balancing algorithm. | bool | -| moe_config.topk_bias_update_rate | Set `aux_loss_free` load balancing algorithm `bias` update step size. | float | -| moe_config.comp_comm_parallel | Set whether to enable computational communication parallelism for ffn. Default value: False. | bool | -| moe_config.comp_comm_parallel_degree | Set ffn to compute the number of communication splits. The higher the number, the more overlap there is, but it will consume more memory. This parameter is only valid when comp_com_parallel is enabled. | int | -| moe_config.moe_shared_expert_overlap | Set whether to enable computational communication parallelism for shared experts and routing experts. Default value: False. 
| bool | - -### Model Training Configuration - -When starting model training, in addition to model-related parameters, you also need to set the parameters of trainer, runner_config, learning rate, and optimizer and other modules required for training, MindSpore Transformers provides the following configuration items. - -| Parameters | Descriptions | Types | -|---------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| -| trainer.type | Set the trainer class, usually different models for different application scenarios will set different trainer classes. | str | -| trainer.model_name | Set the model name in the format '{name}_xxb', indicating a certain specification of the model. | str | -| runner_config.epochs | Set the number of rounds for model training. | int | -| runner_config.batch_size | Set the sample size of the batch data, which overrides the `batch_size` in the dataset configuration. | int | -| runner_config.sink_mode | Enable data sink mode. | bool | -| runner_config.sink_size | Set the number of iterations to be sent down from Host to Device per iteration, effective only when `sink_mode=True`. This argument will be deprecated in a future release. | int | -| runner_config.gradient_accumulation_steps | Set the number of gradient accumulation steps, the default value is 1, which means that gradient accumulation is not enabled. | int | -| runner_wrapper.type | Set the wrapper class, generally set 'MFTrainOneStepCell'. | str | -| runner_wrapper.scale_sense.type | Set the gradient scaling class, generally just set 'DynamicLossScaleUpdateCell'. | str | -| runner_wrapper.scale_sense.use_clip_grad | Turn on gradient clipping. Turning on to avoid cases where the inverse gradient is too large and training fails to converge. | bool | -| runner_wrapper.scale_sense.loss_scale_value | Set the loss dynamic scale factor, the model loss can change dynamically according to the configuration of this parameter. | int | -| lr_schedule.type | Set the lr_schedule class, lr_schedule is mainly used to adjust the learning rate in model training. | str | -| lr_schedule.learning_rate | Set the initialized learning rate size. | float | -| lr_scale | Whether to enable learning rate scaling. | bool | -| lr_scale_factor | Set the learning rate scaling factor. | int | -| layer_scale | Whether to turn on layer attenuation. | bool | -| layer_decay | Set the layer attenuation factor. | float | -| optimizer.type | Set the optimizer class, the optimizer is mainly used to calculate the gradient for model training. | str | -| optimizer.weight_decay | Set the optimizer weight decay factor. | float | -| train_dataset.batch_size | The description is same as that of `runner_config.batch_size`. | int | -| train_dataset.input_columns | Set the input data columns for the training dataset. | list | -| train_dataset.output_columns | Set the output data columns for the training dataset. | list | -| train_dataset.column_order | Set the order of the output data columns of the training dataset. | list | -| train_dataset.num_parallel_workers | Set the number of processes that read the training dataset. | int | -| train_dataset.python_multiprocessing | Enabling Python multi-process mode to improve data processing performance. 
| bool | -| train_dataset.drop_remainder | Whether to discard the last batch of data if it contains fewer samples than batch_size. | bool | -| train_dataset.repeat | Set the number of dataset duplicates. | int | -| train_dataset.numa_enable | Set the default state of NUMA to data read startup state. | bool | -| train_dataset.prefetch_size | Set the amount of pre-read data. | int | -| train_dataset.data_loader.type | Set the data loading class. | str | -| train_dataset.data_loader.dataset_dir | Set the path for loading data. | str | -| train_dataset.data_loader.shuffle | Whether to randomly sort the data when reading the dataset. | bool | -| train_dataset.transforms | Set options related to data enhancement. | - | -| train_dataset_task.type | Set up the dataset class, which is used to encapsulate the data loading class and other related configurations. | str | -| train_dataset_task.dataset_config | Typically set as a reference to `train_dataset`, containing all configuration entries for `train_dataset`. | - | -| auto_tune | Enable auto-tuning of data processing parameters, see [set_enable_autotune](https://www.mindspore.cn/docs/en/r2.6.0/api_python/dataset/mindspore.dataset.config.set_enable_autotune.html) for details. | bool | -| filepath_prefix | Set the save path for parameter configurations after data optimization. | str | -| autotune_per_step | Set the configuration tuning step interval for automatic data acceleration, for details see [set_autotune_interval](https://www.mindspore.cn/docs/en/r2.6.0/api_python/dataset/mindspore.dataset.config.set_autotune_interval.html). | int | - -### Parallel Configuration - -In order to improve the performance of the model, it is usually necessary to configure the parallelism strategy for the model in large-scale cluster usage scenarios. For details, please refer to [Distributed Parallelism](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/distributed_parallel.html), the parallel configuration in MindSpore Transformers is as follows. - -| Parameters | Descriptions | Types | -|-----------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------| -| use_parallel | Enable parallel mode. | bool | -| parallel_config.data_parallel | Set the number of data parallel. | int | -| parallel_config.model_parallel | Set the number of model parallel. | int | -| parallel_config.context_parallel | Set the number of sequence parallel. | int | -| parallel_config.pipeline_stage | Set the number of pipeline parallel. | int | -| parallel_config.micro_batch_num | Set the pipeline parallel microbatch size, which should satisfy `parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage` when `parallel_config.pipeline_stage` is greater than 1. | int | -| parallel_config.seq_split_num | Set the sequence split number in sequence pipeline parallel, which should be a divisor of sequence length. | int | -| parallel_config.gradient_aggregation_group | Set the size of the gradient communication operator fusion group. | int | -| parallel_config.context_parallel_algo | Set the long sequence parallel scheme, optionally `colossalai_cp`, `ulysses_cp` and `hybrid_cp`, effective only if the number of `context_parallel` slices is greater than 1. 
| str | -| parallel_config.ulysses_degree_in_cp | Setting the Ulysses sequence parallel dimension, configured in parallel with the `hybrid_cp` long sequence parallel scheme, requires ensuring that `context_parallel` is divisible by this parameter and greater than 1, and that `ulysses_degree_in_cp` is divisible by the number of attention heads. | int | -| micro_batch_interleave_num | Set the number of multicopy parallel, enable multicopy parallelism if it is greater than 1. Usually enabled when using model parallel, mainly used to optimize the communication loss generated by model parallel, and not recommended to be enabled when only using streaming parallel. For details, please refer to [MicroBatchInterleaved](https://www.mindspore.cn/docs/en/r2.6.0/api_python/parallel/mindspore.parallel.nn.MicroBatchInterleaved.html). | int | -| parallel.parallel_mode | Set parallel mode, `0` means data parallel mode, `1` means semi-automatic parallel mode, `2` means automatic parallel mode, `3` means mixed parallel mode, usually set to semi-automatic parallel mode. | int | -| parallel.gradients_mean | Whether to execute the averaging operator after the gradient AllReduce. Typically set to `False` in semi-automatic parallel mode and `True` in data parallel mode. | bool | -| parallel.enable_alltoall | Enables generation of the AllToAll communication operator during communication. Typically set to `True` only in MOE scenarios, default value is `False`. | bool | -| parallel.full_batch | Whether to load the full batch of data from the dataset in parallel mode. Setting it to `True` means all ranks will load the full batch of data. Setting it to `False` means each rank will only load the corresponding batch of data. When set to `False`, the corresponding `dataset_strategy` must be configured. | bool | -| parallel.dataset_strategy | Only supports `List of List` type and is effective only when `full_batch=False`. The number of sublists in the list must be equal to the length of `train_dataset.input_columns`. Each sublist in the list must have the same shape as the data returned by the dataset. Generally, data parallel splitting is done along the first dimension, so the first dimension of the sublist should be configured to match `data_parallel`, while the other dimensions should be set to `1`. For detailed explanation, refer to [Dataset Splitting](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/dataset_slice.html). | list | -| parallel.search_mode | Set fully-automatic parallel strategy search mode, options are `recursive_programming`, `dynamic_programming` and `sharding_propagation`, only works in fully-automatic parallel mode, experimental interface. | str | -| parallel.strategy_ckpt_save_file | Set the save path for the parallel slicing strategy file. | str | -| parallel.strategy_ckpt_config.only_trainable_params | Whether to save (or load) information about the slicing strategy for trainable parameters only, default is True, set this parameter to `False` when there are frozen parameters in the network but need to be sliced. | bool | -| parallel.enable_parallel_optimizer | Turn on optimizer parallel.
    1. slice model weight parameters by number of devices in data parallel mode.
    2. slice model weight parameters by `parallel_config.data_parallel` in semi-automatic parallel mode. | bool | -| parallel.parallel_optimizer_config.gradient_accumulation_shard | Set whether the cumulative gradient variable is sliced on the data-parallel dimension, only effective if `enable_parallel_optimizer=True`. | bool | -| parallel.parallel_optimizer_config.parallel_optimizer_threshold | Set the threshold for the optimizer weight parameter cut, effective only if `enable_parallel_optimizer=True`. | int | -| parallel.parallel_optimizer_config.optimizer_weight_shard_size | Set the size of the optimizer weight parameter to slice the communication domain, requiring the value to be integrable by `parallel_config.data_parallel`, effective only if `enable_parallel_optimizer=True`. | int | -| parallel.pipeline_config.pipeline_interleave | Enable interleave pipeline parallel, you should set this variable to be `true` when using Seq-Pipe. | bool | -| parallel.pipeline_config.pipeline_scheduler | Set the scheduling strategy of Seq-Pipe, we only support `"seqpipe"` now. | str | - -> Configure the parallel strategy to satisfy device_num = data_parallel × model_parallel × context_parallel × pipeline_stage. - -### Model Optimization Configuration - -1. MindSpore Transformers provides recomputation-related configurations to reduce the memory footprint of the model during training, see [Recomputation](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/perf_optimize/perf_optimize.html#recomputation) for details. - - | Parameters | Descriptions | Types | - |----------------------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------| - | recompute_config.recompute | Whether to enable recompute. | bool/list/tuple | - | recompute_config.select_recompute | Turn on recomputation to recompute only for the operators in the attention layer. | bool/list | - | recompute_config.parallel_optimizer_comm_recompute | Whether to recompute AllGather communication introduced in parallel by the optimizer. | bool/list | - | recompute_config.mp_comm_recompute | Whether to recompute communications introduced by model parallel. | bool | - | recompute_config.recompute_slice_activation | Whether to output slices for Cells kept in memory. | bool | - | recompute_config.select_recompute_exclude | Disable recomputation for the specified operator, valid only for the Primitive operators. | bool/list | - | recompute_config.select_comm_recompute_exclude | Disable communication recomputation for the specified operator, valid only for the Primitive operators. | bool/list | - -2. MindSpore Transformers provides fine-grained activations SWAP-related configurations to reduce the memory footprint of the model during training, see [Fine-Grained Activations SWAP](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/fine_grained_activations_swap.html) for details. - - | Parameters | Descriptions | Types | - |----------------------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------| - | swap_config.swap | Enable activations SWAP. | bool | - | swap_config.default_prefetch | Control the timing of releasing memory in forward phase and starting prefetch in backward phase of the default SWAP strategy, only taking effect when swap=True, layer_swap=None, and op_swap=None. 
| int | - | swap_config.layer_swap | Select specific layers to enable activations SWAP. | list | - | swap_config.op_swap | Select specific operators within layers to enable activations SWAP. | list | - -### Callbacks Configuration - -MindSpore Transformers provides encapsulated Callbacks function class, mainly to achieve to return to the model training state and output in the model training process, save the model weight file and other operations. Currently, the following Callbacks function class is supported. - -1. MFLossMonitor - - This callback function class is mainly used to print information such as training progress, model Loss, and learning rate during the training process and has several configurable items as follows: - - | Parameters | Descriptions | Types | - |--------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| - | learning_rate | Set the initial learning rate in `MFLossMonitor`. The default value is `None`. | float | - | per_print_times | Set the frequency for printing log information in `MFLossMonitor`. The default value is `1`, that is, the log information is printed every step. | int | - | micro_batch_num | Set the size of the micro batch data in each step in the training, which is used to calculate the actual loss value. If this parameter is not set, the value of this parameter is the same as that of `parallel_config.micro_batch_num` in [Parallel Configuration](#parallel-configuration). | int | - | micro_batch_interleave_num | Set the size of the interleave micro batch data in each step of the training. This parameter is used to calculate the actual loss value. If this parameter is not set, the value of this parameter is the same as that of `micro_batch_interleave_num` in [Parallel Configuration](#parallel-configuration). | int | - | origin_epochs | Set the initial number of training epochs in `MFLossMonitor`. If this parameter is not set, the value of this parameter is the same as that of `runner_config.epochs` in [Model Training Configuration](#model-training-configuration). | int | - | dataset_size | Set initial size of the dataset in `MFLossMonitor`. If this parameter is not set, the size of the initialized dataset is the same as the size of the actual dataset used for training. | int | - | initial_epoch | Set start epoch number of training in `MFLossMonitor`. The default value is `0`. | int | - | initial_step | Set start step number of training in `MFLossMonitor`. The default value is `0`. | int | - | global_batch_size | Set the number of global batch data samples in `MFLossMonitor`. If this parameter is not set, the system automatically calculates the number of global batch data samples based on the dataset size and parallel strategy. | int | - | gradient_accumulation_steps | Set the number of gradient accumulation steps in `MFLossMonitor`. If this parameter is not set, the value of this parameter is the same as that of `gradient_accumulation_steps` in [Model Training Configuration](#model-training-configuration). | int | - | check_for_nan_in_loss_and_grad | Whether to enable overflow detection in `MFLossMonitor`. After overflow detection is enabled, the training exits if overflow occurs during model training. The default value is `False`. | bool | - -2. 
SummaryMonitor - - This callback function class is mainly used to collect Summary data, see [mindspore.SummaryCollector](https://www.mindspore.cn/docs/en/r2.6.0/api_python/mindspore/mindspore.SummaryCollector.html) for details. - -3. CheckpointMonitor - - This callback function class is mainly used to save the model weights file during the model training process and has several configurable items as follows: - - | Parameters | Descriptions | Types | - |------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| - | prefix | Set the prefix for saving file names. | str | - | directory | Set the directory for saving file names. | str | - | save_checkpoint_seconds | Set the number of seconds between saving model weights. | int | - | save_checkpoint_steps | Set the number of interval steps for saving model weights. | int | - | keep_checkpoint_max | Set the maximum number of model weight files to be saved, if there are more model weight files in the save path, they will be deleted starting from the earliest file created to ensure that the total number of files does not exceed `keep_checkpoint_max`. | int | - | keep_checkpoint_per_n_minutes | Set the number of minutes between saving model weights. | int | - | integrated_save | Turn on aggregation to save the weights file.
    1. When set to `True`, the weights of all devices are aggregated before the weight file is saved, i.e., every device saves the same complete weights.<br>    2. When set to `False`, each device saves only its own weights.<br>
    When using semi-automatic parallel mode, it is usually necessary to set it to False to avoid memory problems when saving the weights file. | bool | - | save_network_params | Set to save only model weights, default value is `False`. | bool | - | save_trainable_params | Set the additional saving of trainable parameter weights, i.e. the parameter weights of the model when partially fine-tuned, default to `False`. | bool | - | async_save | Set an asynchronous execution to save the model weights file. | bool | - | remove_redundancy | Whether to remove the redundancy for the checkpoint, default value is `False`. | bool | - | checkpoint_format | The format of the checkpoint while saving the checkpoint, default value is `ckpt`. Either `ckpt` or `safetensors`. | str | - -Multiple Callbacks function classes can be configured at the same time under the `callbacks` field. The following is an example of `callbacks` configuration. - -```yaml -callbacks: - - type: MFLossMonitor - - type: CheckpointMonitor - prefix: "name_xxb" - save_checkpoint_steps: 1000 - integrated_save: False - async_save: False -``` - -### Processor Configuration - -Processor is mainly used to preprocess the inference data of the input model. Since the Processor configuration items are not fixed, only the generic configuration items of Processor in MindSpore Transformers are explained here. - -| Parameters | Descriptions | Types | -|--------------------------------|--------------------------------------|-----| -| processor.type | Set the data processing class. | str | -| processor.return_tensors | Set the type of tensor returned by the data processing class, typically use 'ms'. | str | -| processor.image_processor.type | Set the image data processing class. | str | -| processor.tokenizer.type | Set the text tokenizer class. | str | -| processor.tokenizer.vocab_file | Set the path of the file to be read by the text tokenizer, which needs to correspond to the tokenizer class. | str | - -### Model Evaluation Configuration - -MindSpore Transformers provides model evaluation function, and also supports model evaluation while training. The following is the configuration related to model evaluation. - -| Parameters | Descriptions | Types | -|---------------------|-------------------------------------------------------------|------| -| eval_dataset | Used in the same way as `train_dataset`. | - | -| eval_dataset_task | Used in the same way as `eval_dataset_task`. | - | -| metric.type | Used in the same way as `callbacks`. | - | -| do_eval | Enable evaluation while training. | bool | -| eval_step_interval | Set evaluation step interval, default value is 100. The value less than 0 means disable evaluation according to step interval. | int | -| eval_epoch_interval | Set the epoch interval for evaluation, the default value is -1. The value less than 0 means disable the function of evaluating according to epoch interval, it is not recommended to use this configuration in data sinking mode. | int | -| metric.type | Set the type of evaluation. | str | - -### Profile Configuration - -MindSpore Transformers provides Profile as the main tool for model performance tuning, please refer to [Performance Tuning Guide](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/perf_optimize/perf_optimize.html) for more details. The following is the Profile related configuration. 
- -| Parameters | Descriptions | Types | -|-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| -| profile | Whether to enable the performance capture tool, see [mindspore.Profiler](https://www.mindspore.cn/docs/en/r2.6.0/api_python/mindspore/mindspore.Profiler.html) for details. Default: `False`. | bool | -| profile_start_step | Set the number of steps to start collecting performance data. Default: `1`. | int | -| profile_stop_step | Set the number of steps to stop collecting performance data. Default: `10`. | int | -| profile_communication | Set whether communication performance data is collected in multi-device training, this parameter is invalid when using single card training. Default: `False`. | bool | -| profile_memory | Set whether to collect Tensor memory data. Default: `True`. | bool | -| profile_rank_ids | Specify rank ids to enable collecting performance data. Defaults to `None`, which means all rank ids are enabled. | list | -| profile_pipeline | Set whether to enable collecting performance data on one card of each parallel stage. Default: `False`. | bool | -| profile_output | Set the directory of saving performance data. | str | -| profile_level | Set the collection level. Should be one of (0, 1, 2). Default: `1`. | int | -| with_stack | Set whether to collect Python-side stack trace data. Default: `False`. | bool | -| data_simplification | Set whether to enable data simplification, which will delete the FRAMEWORK directory and other extraneous data after exporting performance data. Default: `False`. | int | -| init_start_profile | Set whether to turn on collecting performance data when the Profiler is initialized; this parameter does not take effect when `profile_start_step` is set. This parameter needs to be set to `True` when `profile_memory` is turned on. | bool | -| mstx | Set whether to enable mstx timestamp recording, including training step, HCCL-operators and etc. Default: `False`. | bool | - -### Metric Monitoring Configuration - -The metric monitoring configuration is primarily used to configure methods to record metrics during training, please refer to [Training Metrics Monitoring](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/monitor.html) for more details.Below is a description of the common metric monitoring configuration options in MindSpore Transformers: - -| Parameters | Descriptions | Types | -|-----------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------| -| monitor_config.monitor_on | Set whether to enable monitoring. The default is `False`, which will disable all parameters below. | bool | -| monitor_config.dump_path | Set the save path for metric files of `local_norm`, `device_local_norm` and `local_loss` during training. Defaults to './dump' when not set or set to `null`. | str | -| monitor_config.target | Set the (partial) name of target parameters monitored by metric `optimizer state` and `local_norm`, can be regular expression.Defaults to ['.*'] when not set or set to `null`, that is, specify all parameters. 
| list[str] | -| monitor_config.invert | Set whether to invert the targets specified in `monitor_config.target`, defaults to `False`. | bool | -| monitor_config.step_interval | Set the frequency for metric recording. The default value is `1`, that is, the metrics are recorded every step. | int | -| monitor_config.local_loss_format | Set the format to record metric `local_loss`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. | str/list[str] | -| monitor_config.local_norm_format | Set the format to record metric `local_norm`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. | str/list[str] | -| monitor_config.device_local_norm_format | Set the format to record metric `device_local_norm`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. | str/list[str] | -| monitor_config.optimizer_state_format | Set the format to record metric `optimizer state`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. | str/list[str] | -| monitor_config.weight_state_format | Set the format to record metric `weight L2-norm`, can be string 'tensorboard' and 'log' (represent write to Tensorboard and write to log respectively), or list composed of them, or `null`. Defaults to `null`, that is, do not monitor this metric. | str/list[str] | -| monitor_config.throughput_baseline | Set the baseline of metric `throughput linearity`, must be positive number. Defaults to `null`, that is, do not monitor this metric. | int/float | -| monitor_config.print_struct | Set whether to print all trainable parameters' name of model. If set to `True`, print all trainable parameters' name at the beginning of the first step, and exit training process after step end. Defaults to `False`. | bool | - -### TensorBoard Configuration - -The TensorBoard configuration is primarily used to configure parameters related to TensorBoard during training, allowing for real-time monitoring and visualization of training metrics, please refer to [Training Metrics Monitoring](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/monitor.html) for more details. Below is a description of the common TensorBoard configuration options in MindSpore Transformers: - -| Parameters | Descriptions | Types | -|---------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|--------| -| tensorboard.tensorboard_dir | Set the save path for TensorBoard event. files. | str | -| tensorboard.tensorboard_queue_size | Set the largest number of summaries to keep in a queue; will flush once the queue gets bigger than this. Defaults to 10. | int | -| tensorboard.log_loss_scale_to_tensorboard | Set whether to record the loss scale information to the event file. The default is `False`. 
| bool | -| tensorboard.log_timers_to_tensorboard | Whether to log timer information to TensorBoard, including the duration and throughput of the current training step (or iteration). The default is `False`. | bool | - diff --git a/docs/mindformers/docs/source_en/appendix/env_variables.md b/docs/mindformers/docs/source_en/appendix/env_variables.md deleted file mode 100644 index eb4d388fc061032f6da372894e1eedd6d78d6a82..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/appendix/env_variables.md +++ /dev/null @@ -1,41 +0,0 @@ -# Environment Variable Descriptions - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/appendix/env_variables.md) - -The following environment variables are supported by MindSpore Transformers. - -## Debugging Variables - -| Variables Names | Default | Interpretations | Descriptions | Application Scenarios | -| ------------------------------- | ------ | ------------------------------------------------------ | -------------------------------- | ---------------------------------------------------------------------------------------------------------- | -| **HCCL_DETERMINISTIC** | false | Whether to enable deterministic computation of reductive communication operators, where reductive communication operators include AllReduce, ReduceScatter, Reduce. | `true`: turns on the HCCL deterministic switch;
    `false`: turns off the HCCL deterministic switch. | Turning on deterministic computation eliminates the randomness introduced by inconsistent ordering of multi-card computations, but it results in a performance degradation compared to turning it off. It is recommended to turn it on in scenarios where consistency is required. | -| **LCCL_DETERMINISTIC** | 0 | Whether to turn on the LCCL deterministic operator AllReduce (order-preserving addition). | `1`: turns on the LCCL deterministic switch;<br>
    `0`: turns off the LCCL deterministic switch. | Turning on deterministic computation eliminates the randomness introduced by inconsistent ordering of multi-card computations, but it results in a performance degradation compared to turning it off. It is recommended to turn it on in scenarios where consistency is required.
    Takes effect when rankSize<=8. | -| **CUSTOM_MATMUL_SHUFFLE** | on | Whether to enable shuffle operations for custom matrix multiplication. | `on`: turns on matrix shuffle;
    `off`: turns off matrix shuffle. | The shuffle operation is optimized for specific matrix sizes and memory access patterns. If the matrix size does not match the shuffle-optimized size, turning off shuffling may result in better performance. Please set it according to the actual usage. | -| **ASCEND_LAUNCH_BLOCKING** | 0 | In training or online inference scenarios, this environment variable can be used to control whether synchronous mode is activated during operator execution. | `1`: synchronized mode is mandatory;<br>
    `0`: synchronized mode is optional. | Since operators execute asynchronously by default during NPU model training, the error stack printed when an operator fails is not the actual call stack. When set to `1`, synchronous mode is forced, which prints the correct call stack and makes it easier to debug and locate problems in the code; keeping it at `0` retains asynchronous execution, which is more efficient. | -| **TE_PARALLEL_COMPILER** | 8 | The number of threads used to compile operators in parallel. Parallel compilation is enabled when the value is greater than 1. | Takes a positive integer; the maximum is the number of CPU cores\*80%/number of Ascend AI processors, value range 1~32, default value is 8. | When the network model is large, parallel compilation of operators can be turned on by configuring this environment variable;<br>
    setting it to `1` enables single-threaded compilation, which makes debugging easier. | -| **CPU_AFFINITY** | 0 | Turn on the CPU affinity switch, ensuring that each process or thread is bound to a single CPU core to improve performance. | `1`: turn on the CPU affinity switch;<br>
    `0`: turn off the CPU affinity switch. | CPU affinity is turned off by default for **optimized resource utilization** and **energy saving**. | -| **MS_MEMORY_STATISTIC** | 0 | Memory Statistics. | `1`: turn on memory statistics;
    `0`: turn off memory statistics. | During memory analysis, basic memory usage can be counted. You can refer to [Optimization Guide](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/perf_optimize/perf_optimize.html) for details. | -| **MINDSPORE_DUMP_CONFIG** | NA | Specify the path to the configuration file that the [cloud-side Dump function](https://www.mindspore.cn/tutorials/en/r2.6.0/debug/dump.html) or [end-side Dump function](https://www.mindspore.cn/lite/docs/en/r2.6.0/tools/benchmark_tool.html#dump) depends on. | File path, support relative path and absolute path. | -| **GLOG_v** | 3 | Controls the level of MindSpore logs. | `0`: DEBUG
    `1`: INFO
    `2`: WARNING
    `3`: ERROR, indicates that an error has occurred during program execution; an error log is output, and the program may not terminate;<br>
    `4`: CRITICAL, indicates that an exception has occurred in the execution of the program, and the execution of the program will be terminated. | -| **ASCEND_GLOBAL_LOG_LEVEL** | 3 | Controls the logging level of CANN. | `0`: DEBUG
    `1`: INFO
    `2`: WARNING
    `3`: ERROR
    `4`: NULL, no log is output. | -| **ASCEND_SLOG_PRINT_TO_STDOUT** | 0 | Whether to display on the screen. When turned on, the logs will not be saved in the log file, but the generated logs will be displayed directly on the screen. | `1`: Display on the screen
    `0`: Do not display on the screen | -| **ASCEND_GLOBAL_EVENT_ENABLE** | 0 | Whether to enable event logging. | `1`: turn on Event logging;
    `0`: turn off Event logging. | -| **HCCL_EXEC_TIMEOUT** | 1836 | This environment variable allows you to control the amount of time to wait for synchronization when executing between devices, where each device process waits for the other device to perform communication synchronization for the configured amount of time.| The range is: (0, 17340], and the default value is 1836 in s. | -| **HCCL_CONNECT_TIMEOUT** | 120 | Used in distributed training or inference scenarios to limit the timeout wait time of the socket building process between different devices. | The environment variable needs to be configured as an integer in the range [120,7200], with default value 120s. | -| **MS_NODE_ID** | NA | Specifies process rank id in dynamic cluster scenarios. | The rank_id of the process, unique within the cluster. | - -## Other Variables - -| Variables Names | Default | Interpretations | Descriptions | Application Scenarios | -|------------------------------------|-------------|-----------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ---------------------------------------------------------------------------------------------------------- | -| **RUN_MODE** | predict | Set the running mode. | `predict`: inference
    `finetune`: Fine-tuning
    `train`: Training
    `eval`: Evaluation | -| **USE_ROPE_SELF_DEFINE** | true | Whether to enable ROPE fusion operator. | `true`: enable ROPE fusion operator;
    `false`: disable ROPE fusion operator. | Enabling the ROPE fusion operator by default can improve the computation efficiency. Except for debugging scenarios, turn it off as needed, and generally do not make special settings. | -| **MS_ENABLE_INTERNAL_BOOST** | on | Whether to turn on the internal acceleration of the MindSpore framework. | `on`: turn on MindSpore internal acceleration;
    `off`: turn off MindSpore internal acceleration. | In order to achieve high-performance inference, this parameter is turned on by default. In cases where debugging or comparing different acceleration strategies is performed, this parameter needs to be turned off to observe the impact on performance. | -| **MS_GE_ATOMIC_CLEAN_POLICY** | 1 | Whether to clean up the memory occupied by atomic operators in the network. | `0`: centralized cleanup of memory occupied by all atomic operators in the network;
    `1`: no centralized memory cleanup, individual zeroing of each atomic operator in the network. | The switch is set to `1` by default, which makes it easy for the user to process each operator individually, allowing operations such as operator memory reuse. Setting it to `0` centrally cleans up the memory occupied by the operators. | -| **ENABLE_LAZY_INLINE** | 1 | Whether to enable lazy inline. | `0`: turn off lazy inline;
    `1`: turn on lazy inline. | Available with MindSpore ≥ 2.2.0. It is usually used with pipeline parallelism to improve compilation performance. It is enabled by default and can be disabled via this variable. | -| **ENABLE_LAZY_INLINE_NO_PIPELINE** | 0 | Whether to enable lazy inline in non-pipeline parallel modes. | `0`: turn off lazy inline;<br>
    `1`: turn on lazy inline. |The lazy inline feature is only enabled in pipeline parallel mode by default. To enable lazy inline in other parallel modes, set this environment variable to 1. | -| **MS_ASCEND_CHECK_OVERFLOW_MODE** | INFNAN_MODE | Sets the overflow detection mode. | `SATURATION_MODE`: saturation mode, saturates to floating-point extremes (+-MAX) when the calculation overflows;
    `INFNAN_MODE`: INF/NAN mode, follows the IEEE 754 standard and outputs INF/NAN results as defined. | In large model tuning, INFNAN_MODE aligns the overflow behavior with PyTorch, so it is recommended, i.e. export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE.<br>
    Try setting this variable to INFNAN_MODE when encountering persistent overflow problems. | -| **MF_LOG_SUFFIX** | NA | Set a custom suffix for all log folders. | Suffix for the log folder. Default: no suffix. | Adding a consistent suffix prevents the logs of different tasks from overwriting each other. | -| **PLOG_REDIRECT_TO_OUTPUT** | False | Controls whether plog logs are redirected to a different storage path. | `True`: store the logs in the ./output directory;<br>
    `False`: Store to the default storage location. | This setting makes it easier to query the plog logs. | -| **MS_ENABLE_FA_FLATTEN** | on | Controls whether the FlashAttention flatten optimization is supported. | `on`: Enable FlashAttention flatten optimization;<br>
    `off`: Disable FlashAttention flatten optimization. | Provides a fallback mechanism for models that have not yet been adapted to the FlashAttention flatten optimization. | -| **EXPERIMENTAL_KERNEL_LAUNCH_GROUP** | NA | Controls whether operators can be submitted in parallel batches; when enabled, configures the number of parallel submissions. | `thread_num`: The number of concurrent threads; increasing it is not recommended. The default value is 2;<br>
    `kernel_group_num`: Total number of operator groups, 'kernel_group_num/thread_num' groups per thread, default is' 8 '. | This feature will continue to evolve in the future, and the subsequent behavior may change. Currently, only the `deepseek` reasoning scenario is supported, with certain performance optimization, but other models using this feature may deteriorate, and users need to use it with caution, as follows:`export EXPERIMENTAL_KERNEL_LAUNCH_GROUP="thread_num:2,kernel_group_num:8"`. | \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/conf.py b/docs/mindformers/docs/source_en/conf.py deleted file mode 100644 index 55d99213de9575f9214c4786a6a261422a23e7ce..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/conf.py +++ /dev/null @@ -1,300 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import re -import shutil -import sys -import sphinx -from sphinx.ext import autodoc as sphinx_autodoc -import sphinx.ext.autosummary.generate as g - -# Fix some dl-label lack class='simple' -from docutils.writers import _html_base - -with open(_html_base.__file__, "r", encoding="utf-8") as f: - code_str = f.read() - old_str = ''' if self.is_compactable(node): - classes.append('simple')''' - new_str = ''' if classes == []: - classes.append('simple')''' - code_str = code_str.replace(old_str, new_str) - exec(code_str, _html_base.__dict__) - -# -- Project information ----------------------------------------------------- - -project = 'MindSpore' -copyright = 'MindSpore' -author = 'MindSpore' - -# The full version, including alpha/beta/rc tags -release = 'master' - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -myst_enable_extensions = ["dollarmath", "amsmath"] - - -myst_heading_anchors = 5 -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', - 'myst_parser', - 'nbsphinx', - 'sphinx.ext.mathjax', - 'IPython.sphinxext.ipython_console_highlighting' -] - -source_suffix = { - '.rst': 'restructuredtext', - '.md': 'markdown', -} - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. 
-mathjax_path = 'https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/mathjax/MathJax-3.2.2/es5/tex-mml-chtml.js' - -mathjax_options = { - 'async':'async' -} - -nbsphinx_requirejs_path = 'https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js' - -nbsphinx_requirejs_options = { - "crossorigin": "anonymous", - "integrity": "sha256-1fEPhSsRKlFKGfK3eO710tEweHh1fwokU5wFGDHO+vg=" -} - -smartquotes_action = 'De' - -exclude_patterns = [] - -pygments_style = 'sphinx' - -autodoc_inherit_docstrings = False - -autosummary_generate = True - -autosummary_generate_overwrite = False - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'sphinx_rtd_theme' - -import sphinx_rtd_theme -layout_target = os.path.join(os.path.dirname(sphinx_rtd_theme.__file__), 'layout.html') -layout_src = '../../../../resource/_static/layout.html' -if os.path.exists(layout_target): - os.remove(layout_target) -shutil.copy(layout_src, layout_target) - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = { - 'python': ('https://docs.python.org/', '../../../../resource/python_objects.inv'), - 'numpy': ('https://docs.scipy.org/doc/numpy/', '../../../../resource/numpy_objects.inv'), -} - -# overwriteautosummary_generate add view source for api. -with open('../_ext/overwriteautosummary_generate.txt', 'r', encoding="utf8") as f: - exec(f.read(), g.__dict__) - -# Modify default signatures for autodoc. -autodoc_source_path = os.path.abspath(sphinx_autodoc.__file__) -autodoc_source_re = re.compile(r'stringify_signature\(.*?\)') -get_param_func_str = r"""\ -import re -import inspect as inspect_ - -def get_param_func(func): - try: - source_code = inspect_.getsource(func) - all_params = '' - if hasattr(func, '__dataclass_fields__'): - for k, v in getattr(func, '__dataclass_fields__').items(): - if hasattr(v, 'default'): - if isinstance(v.default, str): - all_params += f"{k} = '{v.default}', " - else: - all_params += f"{k} = {v.default}, " - else: - all_params += f"{k}, " - all_params = all_params.strip(', ') - else: - if func.__doc__: - source_code = source_code.replace(func.__doc__, '') - all_params_str = re.findall(r"def [\w_\d\-]+\(([\S\s]*?)(\):|\) ->.*?:)", source_code) - if "@classmethod" in source_code or "def __new__" in source_code: - all_params = re.sub("(self|cls)(,|, )?", '', all_params_str[0][0].replace("\n", "")) - if ',' in all_params_str[0][0]: - all_params = re.sub("(self|cls)(, |,)", '', all_params_str[0][0].replace("\n", "")) - else: - all_params = re.sub("(self)(,|, )?", '', all_params_str[0][0].replace("\n", "")) - if ',' in all_params_str[0][0]: - all_params = re.sub("(self)(, |,)", '', all_params_str[0][0].replace("\n", "")) - return all_params - except: - return '' - -def get_obj(obj): - if getattr(obj, '__dataclass_fields__', None): - return obj - - if isinstance(obj, type): - try: - test_source = inspect_.getsource(obj.__init__) - except: - return obj.__new__ - obj_init = getattr(obj, '__init__', None) - if obj.__name__ not in str(obj_init) and hasattr(obj, '__new__'): - return obj.__new__ - return obj.__init__ - - return obj -""" - -with open(autodoc_source_path, "r+", encoding="utf8") as f: - code_str = f.read() - code_str = autodoc_source_re.sub('"(" + get_param_func(get_obj(self.object)) + ")"', code_str, count=0) - exec(get_param_func_str, sphinx_autodoc.__dict__) - exec(code_str, 
sphinx_autodoc.__dict__) - -# add @functools.wraps -try: - decorator_list = [("mindformers/tools/logger.py", "__call__", "wrapper"), - ("mindformers/version_control.py", "get_lazy_inline", "decorator")] - - base_path = os.path.dirname(os.path.dirname(sphinx.__file__)) - for i in decorator_list: - with open(os.path.join(base_path, os.path.normpath(i[0])), "r+", encoding="utf8") as f: - content = f.read() - new_content = re.sub('(import .*\n)', r'\1import functools\n', content, 1) - new_content = re.sub(f'def ({i[1]})\((.*?)\):\n(((?!wraps).|\n)*?)([ ]+?)def {i[2]}\(', - rf'def \1(\2):\n\3\5@functools.wraps(\2)\n\5def {i[2]}(', new_content) - new_content = re.sub('@functools.wraps\((self|cls),[ ]*', r'@functools.wraps(', new_content) - if new_content != content: - f.seek(0) - f.truncate() - f.write(new_content) -except: - pass - -re_url = r"(((gitee.com/mindspore/docs)|(github.com/mindspore-ai/(mindspore|docs))|" + \ - r"(mindspore.cn/(docs|tutorials|lite))|(obs.dualstack.cn-north-4.myhuaweicloud)|" + \ - r"(mindspore-website.obs.cn-north-4.myhuaweicloud))[\w\d/_.-]*?)/(master)" - -re_url2 = r"(gitee.com/mindspore/mindspore[\w\d/_.-]*?)/(master)" - -re_url3 = r"(((gitee.com/mindspore/golden-stick)|(mindspore.cn/golden_stick))[\w\d/_.-]*?)/(master)" - -re_url4 = r"(((gitee.com/mindspore/mindformers)|(mindspore.cn/mindformers))[\w\d/_.-]*?)/(dev)" - -for cur, _, files in os.walk(os.path.join(base_path, 'mindformers')): - for i in files: - if i.endswith('.py'): - with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: - content = f.read() - new_content = re.sub(re_url, r'\1/r2.6.0', content) - new_content = re.sub(re_url2, r'\1/v2.6.0', new_content) - new_content = re.sub(re_url3, r'\1/r1.1.0', new_content) - new_content = re.sub(re_url4, r'\1/r1.5.0', new_content) - if new_content != content: - f.seek(0) - f.truncate() - f.write(new_content) - -import mindformers - -# Copy source files of chinese python api from golden-stick repository. 
-from sphinx.util import logging -import shutil -logger = logging.getLogger(__name__) - -src_dir_api = os.path.join(os.getenv("MFM_PATH"), 'docs/api/api_python_en') -moment_dir=os.path.dirname(__file__) - -for root,dirs,files in os.walk(src_dir_api): - for file in files: - if os.path.exists(os.path.join(moment_dir,file)): - os.remove(os.path.join(moment_dir,file)) - shutil.copy(os.path.join(src_dir_api,file),os.path.join(moment_dir,file)) - -if os.path.exists('./mindformers.experimental.rst'): - os.remove('./mindformers.experimental.rst') - -# get params for add view source -import json - -if os.path.exists('../../../../tools/generate_html/version.json'): - with open('../../../../tools/generate_html/version.json', 'r+', encoding='utf-8') as f: - version_inf = json.load(f) -elif os.path.exists('../../../../tools/generate_html/daily_dev.json'): - with open('../../../../tools/generate_html/daily_dev.json', 'r+', encoding='utf-8') as f: - version_inf = json.load(f) -elif os.path.exists('../../../../tools/generate_html/daily.json'): - with open('../../../../tools/generate_html/daily.json', 'r+', encoding='utf-8') as f: - version_inf = json.load(f) - -if os.getenv("MFM_PATH").split('/')[-1]: - copy_repo = os.getenv("MFM_PATH").split('/')[-1] -else: - copy_repo = os.getenv("MFM_PATH").split('/')[-2] - -branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if version_inf[i]['name'] == copy_repo.replace('-','_')][0] -docs_branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if version_inf[i]['name'] == 'tutorials'][0] -cst_module_name = 'mindformers' -repo_whl = 'mindformers' -giturl = 'https://gitee.com/mindspore/' - -def setup(app): - app.add_config_value('docs_branch', '', True) - app.add_config_value('branch', '', True) - app.add_config_value('cst_module_name', '', True) - app.add_config_value('copy_repo', '', True) - app.add_config_value('giturl', '', True) - app.add_config_value('repo_whl', '', True) - -sys.path.append(os.path.abspath('../../../../resource/sphinx_ext')) -# import anchor_mod -import nbsphinx_mod - -sys.path.append(os.path.abspath('../../../../resource/search')) -import search_code - -# src_release = os.path.join(os.getenv("MFM_PATH"), 'RELEASE.md') -# des_release = "./RELEASE.md" -# with open(src_release, "r", encoding="utf-8") as f: -# data = f.read() -# if len(re.findall("\n## (.*?)\n",data)) > 1: -# content = re.findall("(## [\s\S\n]*?)\n## ", data) -# else: -# content = re.findall("(## [\s\S\n]*)", data) -# #result = content[0].replace('# MindSpore', '#', 1) -# with open(des_release, "w", encoding="utf-8") as p: -# p.write("# Release Notes"+"\n\n") -# p.write(content[0]) \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/faq/func_related.md b/docs/mindformers/docs/source_en/faq/func_related.md deleted file mode 100644 index f220efc994b2769b33443924457613447d5c2f00..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/faq/func_related.md +++ /dev/null @@ -1,21 +0,0 @@ -# Function-Related - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/faq/func_related.md) - -## Q: The WikiText dataset download link is not available. - -A: The official download link is not available, please follow the community Issue [#IBV35D](https://gitee.com/mindspore/mindformers/issues/IBV35D). - -
    - -## Q: How Do I Generate a Model Sharding Strategy File? - -A: The model sharding strategy file records the sharding strategy of the model weights in distributed scenarios and is generally needed when slicing weights offline. Configure `only_save_strategy: True` in the network's `yaml` file and then launch the distributed task as usual; the strategy files are then generated in the `output/strategy/` directory. For details, please refer to the [Tutorial on Slicing and Merging Distributed Weights](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/transform_weight.html). - -
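A minimal sketch of the relevant `yaml` fields is shown below. The exact file layout depends on the model's default configuration; `output_dir` and its default value are included here only as an assumed example.

```yaml
# Hypothetical excerpt from a model's yaml configuration file.
output_dir: './output'       # strategy files are written to {output_dir}/strategy/
only_save_strategy: True     # generate and save the sharding strategy files
```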
    - -## Q: What Should I Do When `socket.gaierror: [Errno -2] Name or service not known` or `socket.gaierror: [Errno -3] Temporary failure in name resolution` Is Reported While Generating the `ranktable` File? - -A: Starting from `MindSpore Transformers r1.2.0`, cluster startup is unified through `msrun`, and the `ranktable` startup method is deprecated; launch distributed tasks with `msrun` instead. - -
    \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/faq/mindformers_contribution.md b/docs/mindformers/docs/source_en/faq/mindformers_contribution.md deleted file mode 100644 index a27a71dfda7c22c83e260082de1dd47ae49f79ef..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/faq/mindformers_contribution.md +++ /dev/null @@ -1,154 +0,0 @@ -# MindSpore Transformers Contribution Guidelines - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/faq/mindformers_contribution.md) - -## Contributing Code to MindSpore Transformers - -### Code Style Requirements - -Please follow this style for MindSpore Transformers review, maintenance and development. - -- Coding Guide - - The MindSpore Transformers community uses the `Python PEP 8` coding style. It is recommended to install the following plugins in your IDE to check code format: `Lizard`, `ShellCheck` and `PyLint`. - -- Unit Testing Guide - - The MindSpore Transformers community uses the Python unit testing framework pytest. Annotation names need to reflect the design intent of the test case. - -- Reconstruction Guide - - We encourage developers to reconstruct our code to eliminate code bad taste. All code should conform to coding style and testing style, and reconstructing code is no exception. The Lizard threshold for uncommented lines of code (nloc) is 100, and the circle complexity (cnc) threshold is 20. when a Lizard warning is received, the code to be merged must be reconstructed. - -- Documentation Guide - - We use MarkdownLint to check Markdown document format. The following rules are modified based on the default configuration: - - 1. MD007 (unordered list indent): the parameter indent is set to 4, indicating that all the contents of the unordered list need to be indented by 4 spaces. - 2. MD009 (space at the end of the line): the parameter br_spaces is set to 2, indicating that there can be either 0 or 2 spaces at the end of the line. - 3. MD029 (sequence number of ordered list): the parameter style is set to ordered, indicating ascending order. - -### Fork-Pull Development Model Guide - -- Fork MindSpore Transformers code repository - - Before submitting code to the MindSpore Transformers project, please make sure that you have forked this project to your own code repository. There may be parallel development between the MindSpore Transformers code repository and your own code repository, so please be aware of the consistency between them. - -- Clone remote code repository - - If you want to download the code to your local computer, it is best to use the git method. - - ```shell - # Clone repositories on Gitee - git clone https://gitee.com/(insert_your_forked_repo)/mindformers.git - ``` - -- Local Development Code - - `dev` is the development branch. Please pull the latest code from `dev` branch for development. And submit it to the `dev` branch when you submit your Pull Request. - - ```shell - git checkout -b {new branch name} origin/dev - ``` - -- Submit PR to MindSpore Transformers code repository - - In the last step, you need to pull a compare request between the new branch and the `MindSpore Transformers` master branch. After completing the pull request, `Jenkins CI` will be automatically set up for build testing. PR should be merged into the upstream dev branch as soon as possible to minimize the risk of merging. 
- - ```shell - # Add all changes to the staging area - git add - - # Check Update Status - git status - - # To commit changes, add a commit header with the -m option - git commit -m "The title of your commit" - - # Add a specific description of the commit, add a signature with the -s option, and modify the most recent commit with the -amend option. - git commit -s --amend - - # Push changes to a new branch in the remote repository - git push origin {New branch name} - - ``` - -### Documentation and Code Format - -If you wish to merge custom models into the `MindSpore Transformers` code repository, there are a few things to keep in mind: - -1. The file format and location should follow the norms. -2. Register the new model in the code to adapt it for higher-order interface use. - -#### File Format and Location - -1. The model code files are placed uniformly in the `research/{model_name}` folder in the following format. - - ```plaintext - research/{model_name} - ├── {model_name} - | ├── {pretrain/finetune/predict}_{model_name}_{n}b.yaml - | ├── convert_weight.py # Torch weights to MindSpore weights script (required for migration models) - | ├── convert_reversed.py # MindSpore weights to Torch weights script (required for migration models) - | ├── run_{model_name}.py # Running the code file - | ├── {model_name}.py # Model class code file - | └── {model_name}_tokenizer.py # Tokenizer Code File - ``` - -2. Model documentation is placed in the same `research/{model_name}` folder. - -## Requirements for Submitting A PR - -### Only One Commit - -For multi-commit PRs, use the `squash` command to merge multiple commits into one. For example use: - -```shell -git rebase -i HEAD~3 -``` - -You can see: - -```shell -pick 1234567 Add new function A -pick 89abcdef Fixed bugs in A -pick 01234567 Some optimizations to A -``` - -squash merge commit (can be simplified to abbreviations such as s, p, f, etc.) - -```shell -pick 1234567 Add new function A -pick 89abcdef Fixed bugs in A -pick 01234567 Some optimizations to A -``` - -### PR Descriptions - -Please use the following md template. - -```markdown - -### Related Issue - -### Reason (purpose, problem solved, etc.) - -### Description (what was done, what was changed) - -### check list - -#### Was a program review or root cause analysis of the problem completed (Y/N) - -#### Whether UT/ST of functional modules was completed, executed and passed with results attached (Y/N) - -#### Whether it involves modification of public components or external interfaces, and if so, the scope of modification and impact assessment should be given (Y/N) - -#### Whether it involves the modification of information, and if so, the modification should be synchronized (Y/N) - -``` - -### Access Control Requirements - -1. Submitting a PR requires [signing a CLA](https://www.mindspore.cn/icla). - -2. Submitting a PR requires passing the CI check, which needs to be manually restarted by commenting `/retest` under comments after the gate fails and the code is corrected. 
\ No newline at end of file diff --git a/docs/mindformers/docs/source_en/faq/model_related.md b/docs/mindformers/docs/source_en/faq/model_related.md deleted file mode 100644 index 2213b193605489707c897be2fcdada09aca9e501..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/faq/model_related.md +++ /dev/null @@ -1,17 +0,0 @@ -# Model-Related - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/faq/model_related.md) - -## Q: How Do I Deal with the Network Runtime Error “Out of Memory” (`OOM`)? - -A: This error means that the device memory is insufficient. It can have a variety of causes, so the following checks are recommended (a configuration sketch of the related `yaml` fields is shown after this list). - -1. Use the command `npu-smi info` to verify that the card is used exclusively by your task. -2. Use the default `yaml` configuration of the corresponding network when running it. -3. Increase the value of `max_device_memory` in the network's `yaml` configuration file. Note that some memory must be reserved for inter-card communication, so increase the value incrementally. -4. Adjust the hybrid parallelism strategy: increase pipeline parallelism (pp) and model parallelism (mp) appropriately and reduce data parallelism (dp) accordingly, keeping `dp * mp * pp = device_num`; increase the number of NPUs if necessary. -5. Try reducing the batch size or sequence length. -6. Turn on selective or full recomputation, and enable optimizer parallelism. -7. If the problem persists, please feel free to [raise an issue](https://gitee.com/mindspore/mindformers/issues) for feedback. - -
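As a rough illustration of where the knobs mentioned above live, the sketch below collects the related fields in one place. The field names follow the repository's default YAML layout, but the values are placeholders only and must be tuned for the actual cluster.

```yaml
# Illustrative placeholders only; keep dp * mp * pp equal to the number of devices.
context:
  max_device_memory: "58GB"          # step 3: leave headroom for inter-card communication
parallel:
  enable_parallel_optimizer: True    # step 6: optimizer parallelism
parallel_config:
  data_parallel: 2                   # step 4: reduce dp when increasing mp/pp
  model_parallel: 2
  pipeline_stage: 2
recompute_config:
  recompute: True                    # step 6: full recomputation
```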
    \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/faq/modelers_contribution.md b/docs/mindformers/docs/source_en/faq/modelers_contribution.md deleted file mode 100644 index 54155e0a80e2dd76fbc4cc3226e2ad9173044613..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/faq/modelers_contribution.md +++ /dev/null @@ -1,103 +0,0 @@ -# Modelers Contribution Guidelines - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/faq/modelers_contribution.md) - -## Upload a Model to the Modelers Community - -Modelers Community is a model hosting platform where users can upload custom models to [Modelers Community](https://modelers.cn/) for hosting. - -### MindSpore Transformers Built-in Models - -If the custom model uses a built-in model provided by MindSpore Transformers, i.e. a model whose model code is located under mindformers/models, and no modifications have been made to the model's structure code. You only need to upload the weight file and configuration. - -For example, if a user uses MindSpore Transformers built-in ChatGLM2 model, performs fine-tuning training, and wants to share the fine-tuned model weights, uploading the model configuration and weights file is sufficient. - -Below is sample code that saves the model configuration and weights: - -```python -import mindspore as ms -from mindformers import ChatGLM2Config, ChatGLM2ForConditionalGeneration - -config = ChatGLM2Config() -model = ChatGLM2ForConditionalGeneration(config) -ms.load_checkpoint("path/model.ckpt", model) # Load custom weights - -model.save_pretrained("./my_model", save_json=True) -``` - -The above code runs and saves the config.json file and the mindspore_model.ckpt file (larger weights are automatically split and saved). - -After saving, you can use the openmind_hub library for model uploading. See [Model Upload](https://modelers.cn/docs/zh/best-practices/community_contribution/model_contribution.html#%E4%BD%BF%E7%94%A8openmind-hub-client%E4%B8%8A%E4%BC%A0%E6%A8%A1%E5%9E%8B). - -```python -import openmind_hub - -openmind_hub.upload_folder( - folder_path="/path/to/local/folder", - repo_id="username/your-model-name", - token="your-token", -) -``` - -An uploaded example can be found in the [OpenLlama model](https://modelers.cn/models/MindSpore-Lab/llama_7b/tree/main) of the Modelers community. - -### Custom Models - -If the user has customized model code, you need to upload the model code file at the same time and add a mapping in the json configuration file so that it can be imported through the Auto class. - -#### Naming Rules - -Custom code files uploaded to the community generally have uniform naming rules. Assuming the custom model is named model, its code naming should be as follows: - -```text ----- model - |- configuration_model.py # Config class code files - |- modeling_model.py # Model class code files - |- tokenization_model.py # Tokenizer code files -``` - -#### Adding auto Mapping - -In order for the Auto class to be able to find the user-defined model class when it is used, you need to add the auto mapping in the config.json file. 
The contents of the additions are as follows: - -```json -{ - "auto_map": { - "AutoConfig": "configuration_model.MyConfig", - "AutoModel": "modeling_model.MyModel", - "AutoModelForCausalLM": "modeling_model.MyModelForCausalLM", - }, -} -``` - -If there is a custom tokenizer, the tokenizer needs to be saved: - -```python -tokenizer.save_pretrained("./my_model", save_json=True) -``` - -And add auto mapping to the saved tokenizer_config.json:. - -```json -{ - "auto_map": { - "AutoTokenizer": ["tokenization_model.MyTokenizer", "tokenization_model.MyFastTokenizer"] - }, -} -``` - -#### Uploading the Model - -Model uploading can be done using the openmind_hub library. See [Model Upload](https://modelers.cn/docs/zh/best-practices/community_contribution/model_contribution.html#%E4%BD%BF%E7%94%A8openmind-hub-client%E4%B8%8A%E4%BC%A0%E6%A8%A1%E5%9E%8B). - -```python -import openmind_hub - -openmind_hub.upload_folder( - folder_path="/path/to/local/folder", - repo_id="username/your-model-name", - token="your-token", -) -``` - -The uploaded example can be found in the [Model](https://modelers.cn/models/MindSpore-Lab/internlm2-7b/tree/main) of the Modelers community. \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/function/dataset.md b/docs/mindformers/docs/source_en/function/dataset.md deleted file mode 100644 index eeeb2f1c4739cdc26aec8df33a57c3249efdd4f1..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/function/dataset.md +++ /dev/null @@ -1,783 +0,0 @@ -# Dataset - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/function/dataset.md) - -At present, MindSpore Transformers' pre-training and fine-tuning support the ability to load datasets in multiple formats, including loading methods for Megatron Dataset, MindRecord Dataset, and HuggingFace datasets. The specific usage instructions for each format of dataset are as follows. - -## Megatron Dataset - -Megatron Dataset refers to a dataset collected from multiple different sources, it contains different text types, formats, and domains. Using dataset can help models learn a wider range of language features and knowledge, thereby improving their generalization ability and performance. The current implementation of the Megatron framework requires preprocessing the original dataset into a BIN format dataset. MindSpore Transformers have been natively adapted to the Megatron Dataset, providing scripts for creating BIN format datasets and supporting direct use of the Megatron Dataset in training tasks. - -### How to Make a BIN Format Dataset - -MindSpore Transformers provides a preprocessing script [mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py), which can convert text data to a BIN format dataset. This script currently only supports processing files in a specific JSON format. Users need to first convert the original dataset file into a specific JSON format file, and then use a preprocessing script to generate a BIN format dataset file. Some models in MindSpore Transformers currently provide scripts for converting specific open-source datasets into JSON format files. If users want to use their own datasets, they need to write their own scripts to convert them into the desired format. 
- -The format of the required JSON format file content is as follows: - -```json -{"id": "0", "text": "The quick brown fox", "type": "Eng", "src": "www.nvidia.com", "title": "First Part"} -{"id": "1", "text": "jumps over the lazy dog", "type": "Eng", "src": "The Internet", "title": "Second Part"} -... -``` - -Each piece of data consists of several key value pairs, and the supported keys and descriptions are as follows: - -- `"id"`: The numbering of the data should be in order, required -- `"text"`: Text data actually used for training, required -- `"type"`: Indicate language type, optional -- `"src"`: Indicate the source of the data, optional -- `"title"`: Indicate the title of the data, optional - -Taking the processing of Wiki datasets and their use as pre-training for Llama2 models as an example, the detailed steps for creating BIN format datasets are explained below: - -1. Download Wiki Dataset - - For the original Wiki Dataset downloading, refer to [Llama2 Dataset Download](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#%E6%95%B0%E6%8D%AE%E5%8F%8A%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87). - -2. Generate JSON Format File - - The original format of the Wiki Dataset is as follows: - - ![](image/wikitext_sample.png) - - The format of the JSON file `wiki.json` after processing the Wiki Dataset is as follows (omitting long text): - - ```json - {"id": 0, "text": "The gold dollar or gold one ..."} - {"id": 1, "text": "Super Mario Land is a 1989 ..."} - {"id": 2, "text": "The Sinclair Scientific Programmable ..."} - ... - ``` - -3. Download The Vocabulary File For Llama2 - - In the preprocessing script, the raw text data will be processed into Tokens using the Tokenizer of the model, therefore, it is necessary to download the vocabulary file in advance. - - Download link for Llama2 vocabulary file: [tokenizer.model](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/tokenizer.model) - -4. 
Generate BIN Format Files Using Preprocessing Scripts - - After processing into the specific JSON format file mentioned above, using [mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py) to convert it into a BIN format dataset, the specific command is as follows: - - ```shell - python mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py \ - --input ./wiki.json \ - --output-prefix wiki_processed_1024 \ - --tokenizer-type LlamaTokenizer \ - --vocab-file ./tokenizer.model \ - --add_bos_token True \ - --add_eos_token True \ - --pad_or_stitch stitch \ - --seq-length 1024 \ - --workers 1 - ``` - - Configuration parameter description: - - - `--input`: Path to JSON format file - - `--output-prefix`: The file name prefix of the preprocessed output file - - `--tokenizer-type`: The type of tokenizer corresponding to the model - - `--vocab-file`: The path of the vocabulary file for the tokenizer model tokenizer - - `--add_bos_token`: Add bos_token at the beginning of the data, Default: False - - `--add_eos_token`: Add eos_token at the ending of the data, Default: False - - `--pad_or_stitch`: According to the requirements of the training task, set whether to splice or fill in, pad is in fill in mode, this mode will fill in the data with insufficient length to the seq length; Stitch is a concatenation mode that concatenates multiple pieces of data into data with a length of seq length - - `--seq-length`: Preprocess the length of each piece of data - - `--workers`: The number of parallel workers during preprocessing - -After executing the above command, two files will be obtained, in `.bin` and `.idx` formats respectively. The `.bin` format file stores the actual data, and `.idx` format file stores the index of each piece of data. - -### Using Megatron Datasets in Training Tasks - -Use the Megatron multi-source dataset in the training task as follows: - -1. Prepare the `parallel_speed_up.json` file - - `parallel_speed_up.json` is a dataset parallel communication configuration file, and the file content is as follows: - - ```json - { - "dataset_broadcast_opt_level": 3 - } - ``` - -2. Set environment variables - - Enter the following command at the command line to set environment variables: - - ```shell - export MS_DEV_DYNAMIC_SINK1=False - ``` - -3. Modify YAML configuration files for training tasks - - Configure the relevant parameters of Megatron Dataset in YAML configuration file. Here, taking the Llama2-7B model pre-training task as an example, modify `train_dataset` , `runner_config` , `parallel_config` , `parallel` and `context` in [pretrain_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/pretrain_llama2_7b.yaml#L39). The specific modifications and explanations are as follows: - - ```yaml - train_dataset: &train_dataset - data_loader: - type: BlendedMegatronDatasetDataLoader - datasets_type: "GPTDataset" - sizes: - - 1000 - - 0 - - 0 - shuffle: False - config: - seed: 1234 - seq_length: 1024 - split: "1, 0, 0" - data_path: - - 0.3 - - "/path/to/my_wiki_test_1024_text_document" - - 0.7 - - "/path/to/my_wiki_test_1024_text_document" - num_dataset_builder_threads: 1 - eod_mask_loss: False - create_attention_mask: False - input_columns: ["input_ids", "labels", "loss_mask", "position_ids"] - ``` - - Among them: - - - data_loader.type: The type of dataloader, should be set to `BlendedMegatronDatasetDataLoader`. 
- - data_loader.datasets_type: Dataset type, currently only supports `GPTDataset`. - - data_loader.sizes: `- 1000` , `- 0` , `- 0` are the sampling sizes for the training set, test set, and validation set, respectively. Currently, only the training set can be configured. - - input_columns: Set the input data columns for the training dataset, typically configured as `["input_ids", "labels", "loss_mask", "position_ids"]` . - - data_loader.config.seed: Random number seed when creating a dataset. Default: `1234` . - - data_loader.config.seq_length: The length of each piece of data must be consistent with the model.model_config.seq_length in the YAML configuration. - - data_loader.config.split: Split string, separate the weights of the training set, test set, and validation set with commas, used to split the dataset when drawing samples from a single distribution. Currently, only supports configuration as `"1, 0, 0"` . - - data_loader.config.data_path: The number is the weight of each dataset, and the string is the path of the dataset BIN file, which needs to remove the file format suffix `.bin` . - - data_loader.config.num_dataset_builder_threads: The number of processes used when creating the dataset. Default: `1` . - - data_loader.config.eod_mask_loss: Do you want to use the switch of eod mask. Default: `False` . - - data_loader.config.create_attention_mask: Whether to construct attention_mask. Default: `True` . - - There are still limitations to the current Megatron Dataset, which only supports non full batch scenarios, and it does not support the parallel feature of seq_pipe. The corresponding configuration items need to be modified according to the following: - - ```yaml - runner_config: - sink_mode: True - sink_size: 1 - - parallel_config: - data_parallel: &dp 2 - model_parallel: 2 - pipeline_stage: 1 - - parallel: - full_batch: False - dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1]] - - context: - ascend_config: - parallel_speed_up_json_path: "/path/to/parallel_speed_up.json" - ``` - - The configuration instructions that need to be noted are as follows: - - - parallel.dataset_strategy: Only support List of List type, parallel.dataset_strategy: Only support List of List type. The number of sub lists in a List needs to be equal to the length of train_dataset.input_columns, and each sub List in the List needs to be consistent with the shape of the data returned by the dataset. Generally, parallel data partitioning is performed in the first dimension of the data, so the first bit of the sub List is configured as `*dp` , and the other bits are configured as `1` . The specific principle can be referred to [Dataset Segmentation](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/dataset_slice.html). - -4. Compile Megatron Dataset module - - MindSpore Transformers have built-in Megatron Dataset module code, before starting the training task, the following command needs to be executed for compilation: - - ```shell - pip install pybind11 - cd mindformers/dataset/blended_datasets - make - ``` - -## MindRecord Dataset - -MindRecord is an efficient data format developed by MindSpore for storing machine learning or deep learning datasets. - -The MindRecord format is designed to improve data processing efficiency, especially in large-scale data training scenarios where data can be loaded and processed faster. -MindRecord files typically contain the input samples needed for model training, which are preprocessed (e.g., encoded, normalized) to optimize read speed and memory usage. 
- -For more information about the implementation of MindRecord related interfaces and examples, please refer to the [documentation about MindRecord in MindSpore](https://www.mindspore.cn/docs/en/r2.6.0/api_python/mindspore.mindrecord.html). - -### How to Make a MindRecord Dataset - -The MindRecord module provides methods to convert different datasets into MindRecord format. -You can use the FileWriter interface provided by MindSpore to generate MindRecord format datasets. - -The following is an example of a MindRecord dataset based on a json format file, taking Llama2 as an example: - -1. Prepara json file - - Prepare a json file like this, named `mydata.json`: - - ```json - [ - { - "text": "I love Beijing, because it is a city that beautifully blends rich history with modern vibrancy." - }, - { - "text": "I love Hangzhou, because it is a city that seamlessly combines natural beauty with rich cultural heritage." - } - ] - ``` - -2. Read json file - - ```python - import json - - raw_data = None - file = open("mydata.json", "r") # Open json file - if file is not None: - raw_data = json.load(file) # Read json file into raw_data - file.close() - ``` - -3. Define a MindRecord ``schema`` and create a ``FileWriter`` object; - - ```python - from mindspore.mindrecord import FileWriter - - # Define a schema for MindRecord - schema = {'input_ids': {"type": "int32", "shape": [-1]} - # Create a FileWriter object - writer = FileWriter(file_name="output_file", shard_num=1) - writer.add_schema(schema, "dataset_type") - ``` - -4. Iterate through each piece of data in the processed json file, convert it to MindRecord format, and write it to a MindRecord file. - - Word list download link: [tokenizer.model](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/tokenizer.model) - - ```python - import numpy as np - from mindformers import LlamaTokenizer - - def tokenize_json(tokenizer, raw_data): - """tokenize json file dataset""" - content = [] # Read each json data and get its “input_ids”. - for line in raw_data: - stripped_line = line['text'].strip() - if stripped_line: - line_ids = tokenizer(stripped_line)["input_ids"] - content.append(line_ids) - - for ids in content: - sample = {} - sample['input_ids'] = np.array(ids, dtype=np.int32) - yield sample - - # Tokenize the text data - word_tokenizer = LlamaTokenizer(vocab_file=r"tokenizer.model") - - # Iterate through each piece of data in the processed json file, convert it to MindRecord format and write it to the MindRecord file - # tokenize_json is a custom method to tokenize the dialog data in json. - for x in tokenize_json(word_tokenizer, raw_data): - writer.write_raw_data([x]) - writer.commit() - ``` - -For the detailed cases, refer to [Examples of Data Preprocessing in Llama2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#%E6%95%B0%E6%8D%AE%E5%8F%8A%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87). - -### Using MindRecord Format Datasets in Tasks - -You can make a training or evaluation task use a prepared MindRecord format dataset by configuring dataset-related parameters in the yaml configuration file. 
- -Here, as an example, for the Llama2-7B model pretraining task, the default configuration parameters and descriptions in the [pretrain_llama2_7b.yaml file](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/pretrain_llama2_7b.yaml#L39) are as follows: - -```yaml -# dataset -train_dataset: &train_dataset - data_loader: - type: MindDataset - dataset_dir: "" - shuffle: True - input_columns: ["input_ids"] - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - batch_size: 6 - repeat: 1 - numa_enable: False - prefetch_size: 1 - -train_dataset_task: - type: CausalLanguageModelDataset - dataset_config: *train_dataset -``` - -Configure the following parameters to use MindRecord format datasets: - -- data_loader.type: The type of the dataloader, which needs to be set to `MindDataset`. -- data_loader.dataset_dir: The path to the dataset file. -- input_columns: Sets the data columns for the input of the training dataset. Currently a pre-training scenario, set to `["input_ids"]`. - -The rest of the parameters can be described in "model training configuration" and "model evaluation configuration [Configuration File Description](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html). - -## HuggingFace Datasets - -Currently, the dataset loading functionality has been integrated with the [ModelScope Open-Source Community](https://modelers.cn/datasets) and the [HuggingFace Community](https://huggingface.co/datasets), supporting online dataset loading and preprocessing. Additionally, datasets can be [packed](#dataset-packing) to enhance model training efficiency. - -### Usage Instructions - -HuggingFace datasets support online and offline loading of datasets from both the HuggingFace community and the MoLo open-source community. Below is an introduction to environment preparation, the dataset loading process, and how to configure the use of HuggingFace datasets in configuration files. - -#### Integrating with Open-Source Communities - -- Integrating with HuggingFace Community - - To use datasets from the HuggingFace community, follow these steps: - - 1. Environment Setup - - The environment variable `HF_ENDPOINT` controls the remote repository used by HuggingFace. By default, it is set to `https://huggingFace.co`. - For users in China, it is recommended to configure it to the mirror address ```export HF_ENDPOINT=https://hf-mirror.com``` . - - 2. Install Dependencies - - ```shell - pip install datasets - ``` - -- Integrating with ModelScope Open-Source Community - - To use datasets from the ModelScope Open-Source Community, follow these steps: - - 1. Environment Setup - - The environment variable `OPENMIND_HUB_ENDPOINT` controls the remote repository used by the ModelScope Open-Source Community. - Defaults to ```export OPENMIND_HUB_ENDPOINT=https://telecom.openmind.cn``` when not configured. - - 2. Install Dependencies - - ```shell - git clone https://gitee.com/openmind-ai/openmind-hub.git - cd openmind-hub - pip install -e . - cd .. - git clone https://gitee.com/foundation-models/openmind-datasets.git - cd openmind-datasets - pip install -e . - cd .. - ``` - -> When the openmind-datasets component is installed in the environment, the default interface is the Modelers open source community, if you want to interface with HuggingFace, the environment variable `USE_OM` can control which community to interface with, the default value is `ON` for the Modelers community, change it to `OFF` to interface with the HuggingFace community. 
- -#### Dataset Loading Process - -![commondataloader.png](../../source_zh_cn/function/image/commondataloader.png) - -The online dataset loading and processing functionality is primarily implemented through `CommonDataLoader`. The data loading part can be customized via configuration files, with detailed configuration instructions available in the [dataloader parameter description](#dataloader-parameter-description). The online loading module requires users to implement customizations for different datasets. For example, the `AlpacaInstructDataHandler` class can be used to preprocess the `alpaca` dataset. For more information, please refer to [Custom Data Handler](#custom-data-handler). - -#### dataloader Parameter Description - -The online dataset loading feature is enabled by configuring the `data_loader` in the configuration file. Below is an example configuration for online dataset loading: - -```yaml -train_dataset: &train_dataset - input_columns: &input_columns ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] - construct_args_key: *input_columns - data_loader: - type: CommonDataLoader - load_func: 'load_dataset' - shuffle: False - split: "train" - path: "llm-wizard/alpaca-gpt4-data" - packing: pack - handler: - - type: AlpacaInstructDataHandler - tokenizer_name: llama2_7b - seq_length: 4096 - prompt_key: "conversations" - output_columns: ["input_ids", "labels"] - is_dynamic: False - - type: PackingHandler - seq_length: 4096 - output_columns: ["input_ids", "labels", "actual_seq_len"] - adaptor_config: - compress_mask: False - column_names: *input_columns -``` - -Parameter descriptions for `data_loader` are as follows: - -| Parameter Name | Description | Type | -|----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----:| -| type | Fixed as `CommonDataLoader`. This module supports loading datasets from HuggingFace and the ModelScope open-source community. | str | -| packing | Packing configuration when processing datasets with `handler`. Options include `pack` and `truncate`. | str | -| load_func | The function used to load datasets. Options are `load_dataset` and `load_from_disk`. Use `load_from_disk` for data saved via the `save_to_disk` function, and `load_dataset` for other scenarios. The default value is `load_dataset`. | str | -| path | When `load_func=load_dataset`, this parameter aligns with the interface in [datasets.load_dataset](https://huggingface.co/docs/datasets/loading). When `load_func=load_from_disk`, it specifies the dataset loading path. | str | -| data_files | When `load_func=load_dataset`, this parameter aligns with the interface in [datasets.load_dataset](https://huggingface.co/docs/datasets/loading). It is ineffective when `load_func=load_from_disk`. | str | -| handler | Multiple `handlers` can be configured to preprocess the loaded dataset in the order specified. For details on `handler` configuration, refer to the handler parameter description in [Custom Data Handler](#custom-data-handler). | list | -| adaptor_config | Dataset-related configuration during model training. Currently supports `compress_mask`, effective when `packing` is set. If enabled, it returns a compressed data mask. Default is `False`. | dict | -| shuffle | Indicates whether random sampling is enabled when loading the dataset. 
| bool | -| column_names | Specifies the column names returned by the dataset. If not set, all columns are returned. | list | -| is_dynamic | Indicates whether the dataset returns dynamic-length data. Default is `False`. | bool | - -> In addition to the above configurations, all parameters from the [datasets.load_dataset](https://huggingface.co/docs/datasets/loading) interface are supported with the same meanings and functions. - -When packing is configured, the dataset returns an `actual_seq_len` column. For more information, refer to the `actual_seq_qlen` and `actual_seq_kvlen` parameter descriptions in the [documentation](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0027.html). - -### Feature Introduction - -#### Dynamic Sequence Length Fine-Tuning - -`CommonDataLoader` supports dynamic shape fine-tuning using HuggingFace datasets, which can be loaded online or offline. Below, we use the `alpaca` dataset as an example to demonstrate the configuration for dynamic shape fine-tuning. - -- Online Loading - - The online dataset name is `llm-wizard/alpaca-gpt4-data`. You can search and download it from the [HuggingFace official website](https://huggingface.co/datasets) or load it directly using the online name. - - Example configuration for online loading: - - ```yaml - train_dataset: &train_dataset - input_columns: &input_columns ["input_ids", "labels"] - dynamic_batch: True # Enable dynamic shape - divisor: 32 # With divisor and remainder configured, seq_length in dynamic shape will become a multiple of divisor and the sum of remainder - remainder: 1 - data_loader: - type: CommonDataLoader - shuffle: True - split: "train" # Subset name of the online dataset - path: "llm-wizard/alpaca-gpt4-data" # Online dataset name - handler: - - type: AlpacaInstructDataHandler - tokenizer_name: llama2_7b - seq_length: 4096 - prompt_key: "conversations" - output_columns: *input_columns - is_dynamic: True - seed: 0 - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - repeat: 1 - numa_enable: False - prefetch_size: 1 - ``` - - 1. For parameter descriptions in `train_dataset`, please refer to the [documentation](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html). - - 2. `AlpacaInstructDataHandler` is an online processing script developed for the `alpaca` dataset. If using a different dataset, you need to implement a custom data handler by referring to the [Custom Data Handler](#custom-data-handler) guide. - -- Offline Loading - - For offline loading, you need to prepare the JSON files of the `alpaca` dataset. The offline configuration differs from the online configuration only in the following parameters: - - ```yaml - train_dataset: - data_loader: - path: "json" # loading datasets using the load_dataset interface - data_files: '/path/alpaca_gpt4_data.json' # the file path of the alpaca dataset - ``` - -After configuring the dataset loading method, you also need to set `is_dynamic=True` in the model configuration to enable dynamic shape training for the model. - -```yaml -model_config: - is_dynamic: True -``` - -Since dynamic shapes may lead to operator compilation caching, it is recommended to set the following environment variables to limit the number of cached compilations when running in a memory-constrained environment. 
This helps prevent out-of-memory issues: - -```shell -export ACLNN_CACHE_LIMIT=10 -export MS_DEV_RUNTIME_CONF="aclnn_cache_queue_length:64" -``` - -- The `ACLNN_CACHE_LIMIT` parameter description can be found in the [documentation](https://www.hiascend.com/document/detail/zh/canncommercial/800/apiref/envvar/envref_07_0031.html). -- `MS_DEV_RUNTIME_CONF` is a parameter in MindSpore for setting the operator cache queue length. The value `64` represents the length of the sequence, which defaults to `1024`. This can be adjusted based on the actual environment. Setting the value too small may affect model training performance. - -After completing all the configurations above, you can proceed with dynamic shape fine-tuning by referring to the documentation for the specific model you are using. - -#### Custom Data Handler - -Users can define custom data handlers to apply various preprocessing logic to the loaded dataset. - -- Handler Parameter Description - - | Parameter Name | Description | Type | - |----------------|---------------------------------------------------------------------------------------------------------------------------------------|:--------:| - | type | Custom data handler name. A custom handler must inherit from `BaseInstructDataHandler`. | str | - | tokenizer_name | Name of the tokenizer used. | str | - | tokenizer | Tokenizer configuration parameters. Can be a dictionary, string, or a `tokenizer` object. Takes lower priority than `tokenizer_name`. | dict/str | - | seq_length | Maximum sequence length, usually the same as the model's sequence length. | int | - | output_columns | Column names of the processed data returned after preprocessing. | list | - | prompt_key | Column name for data after applying prompt processing. | str | - -- Development Sample 1 - - The custom data handler is usually placed in the `mindformers/dataset/handler` directory, and the customized one needs to inherit the abstract base class ``BaseInstructDataHandler``. - You need to implement ``format_func`` and ``tokenize_func`` methods, which preprocess each data loaded. Refer to ``alpaca_handler.py``. - - ```python - @MindFormerRegister.register(MindFormerModuleType.DATA_HANDLER) - class XXXInstructDataHandler(BaseInstructDataHandler): - - def format_func(self, example): - # Custom data format conversion - - def tokenize_func(self, example): - # Custom tokenizer split word processing - ``` - - The ``BaseInstructDataHandler`` provides an implementation of the entry ``handler`` method by default, which is used to iterate over each piece of data for data preprocessing. - The ``format_func`` is used to implement how to convert the raw data into the desired data format, and the ``tokenize_func`` method is used to take the processed data and perform a customized tokenization. - The input parameter ``example`` in the example is each of the samples obtained. - -- Development Sample 2 - - If you want to process the data directly for the whole dataset instead of processing each piece of data in batches, you can implement the entry ``handle`` method in custom handler, and you will get the complete dataset, as shown below: - - ```python - def handle(self, dataset): - """data handler""" - return dataset.rename_columns({"content":"prompt","summary":"answer"}) - ``` - -- alpaca Dataset Sample - - Modify the task configuration file [finetune_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/finetune_llama2_7b.yaml). 
- - Modify the following parameters: - - ```yaml - train_dataset: &train_dataset - input_columns: &input_columns ["input_ids", "labels"] - data_loader: - type: CommonDataLoader - shuffle: True - split: "train" - path: "llm-wizard/alpaca-gpt4-data" - handler: - - type: AlpacaInstructDataHandler - tokenizer_name: llama2_7b - seq_length: 4096 - prompt_key: "conversations" - output_columns: *input_columns - seed: 0 - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - repeat: 1 - numa_enable: False - prefetch_size: 1 - ``` - - The rest of the parameters can be described in "model training configuration" and "model evaluation configuration [Configuration File Description](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html). - - Custom data handler: - - ```python - @MindFormerRegister.register(MindFormerModuleType.DATA_HANDLER) - class AlpacaInstructDataHandler(BaseInstructDataHandler): - - def format_func(self, example): - """format func""" - source = PROMPT_INPUT.format_map(example) \ - if example.get(self.input_key, "") != "" \ - else PROMPT_NO_INPUT.format_map(example) - target = example.get(self.output_key) - formatted_example = [ - { - "from": self.user_role, - "value": source, - }, - { - "from": self.assistant_role, - "value": target, - }, - ] - - return formatted_example - - def tokenize_func(self, messages): - """tokenize func""" - conversation = self.gen_prompt(messages) - sep = self.template.sep + self.assistant_role + ": " - # Tokenize conversations - rounds = conversation.split(self.template.sep2) - ids = [self.tokenizer.bos_token_id] - mask = [1] - for _, rou in enumerate(rounds): - if rou == "": - break - conv_out = self.tokenizer(rou) - ids.extend(conv_out['input_ids'][1:]) - mask.extend(conv_out['attention_mask'][1:]) - d = {'input_ids': ids, 'attention_mask': mask} - # pylint: disable=W0212 - if not self.dynamic: - d = self.tokenizer._pad(d, max_length=self.seq_length + 1, padding_strategy='max_length') - input_id = d['input_ids'][:self.seq_length + 1] - target = np.array(d['input_ids']) - total_len = int(np.not_equal(target, self.tokenizer.pad_token_id).sum()) - cur_len = 1 - target[:cur_len] = self.ignore_token_id - for _, rou in enumerate(rounds): - if rou == "": - break - parts = rou.split(sep) - if len(parts) != 2: - break - parts[0] += sep - round_len = len(self.tokenizer(rou)['input_ids']) - 1 - instruction_len = len(self.tokenizer(parts[0])['input_ids']) - 3 - - target[cur_len: cur_len + instruction_len] = self.ignore_token_id - - cur_len += round_len - if self.dynamic: - return { - "input_ids": input_id, - "labels": target[:len(input_id)].tolist() - } - target[cur_len:] = self.ignore_token_id - if cur_len < self.seq_length + 1: - if cur_len != total_len: - target[:] = self.ignore_token_id - else: - target = target[:self.seq_length + 1] - label = target.tolist() - return { - "input_ids": input_id, - "labels": label, - } - ``` - -- ADGEN Dataset Sample - - Modify the task configuration file [run_glm3_6b_finetune_2k_800T_A2_64G.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/glm3/run_glm3_6b_finetune_2k_800T_A2_64G.yaml). 
- - Modify the following parameters: - - ```yaml - train_dataset: &train_dataset - data_loader: - type: CommonDataLoader - path: "HasturOfficial/adgen" - split: "train" - shuffle: True - handler: - - type: AdgenInstructDataHandler - phase: "train" - version: 3 - column_names: ["prompt", "answer"] - tokenizer: - type: ChatGLM3Tokenizer - vocab_file: "/path/to/tokenizer.model" - input_columns: ["input_ids", "labels"] - max_source_length: 1024 - max_target_length: 1023 - ignore_pad_token_for_loss: True - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - batch_size: 8 - repeat: 1 - numa_enable: False - prefetch_size: 1 - seed: 0 - ``` - - The rest of the parameters can be described in "model training configuration" and "model evaluation configuration [Configuration File Description](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html). - - Custom adgen_handler: - - ```python - @MindFormerRegister.register(MindFormerModuleType.DATA_HANDLER) - class AdgenInstructDataHandler(BaseInstructDataHandler): - """agden data handler""" - def handle(self, dataset): - """data handler""" - return dataset.rename_columns({"content": "prompt", "summary": "answer"}) - ``` - -#### Dataset Packing - -Configuring `PackingHandler` in `CommonDataLoader` allows for packing processing of the data. Currently, the original data needs to be processed into `input_ids` and `labels` that can be fed into the model during the preprocessing step. - -- Parameter Description - - | Parameter Name | Description | Type | - |----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----:| - | type | Fixed as `PackingHandler`. This module supports packing data. When `packing=pack` or `packing=truncate` is configured in [dataloader](#dataloader-parameter-description), it performs non-truncating and truncating concatenation of the data, respectively. | str | - | seq_length | Maximum sequence length of the data after packing. | int | - | pad_token | Token ID used for padding `input_ids` when the packed sample does not reach the maximum length. Default value is 0. | int | - | ignore_token | Token ID used for padding `labels` when the packed sample does not reach the maximum length. Default value is -100. | int | - -- Packing Example - - By following the configuration below, the `alpaca` dataset can be preprocessed to achieve online packing. - - ```yaml - train_dataset: &train_dataset - input_columns: &input_columns ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] - construct_args_key: *input_columns - data_loader: - type: CommonDataLoader - shuffle: False - split: "train" - path: "llm-wizard/alpaca-gpt4-data" - packing: pack - handler: - - type: AlpacaInstructDataHandler - tokenizer_name: llama2_7b - seq_length: 4096 - prompt_key: "conversations" - output_columns: ["input_ids", "labels"] - - type: PackingHandler - seq_length: 4096 - output_columns: ["input_ids", "labels", "actual_seq_len"] - adaptor_config: - compress_mask: False - seed: 0 - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - repeat: 1 - numa_enable: False - prefetch_size: 1 - ``` - -Using the above configuration file to process the `alpaca` dataset will execute the following steps: - -1. 
The raw text data will be processed into `input_ids` and `labels` using `AlpacaInstructDataHandler` and the `tokenizer` of `llama2_7b`. -2. `PackingHandler` will be used to perform packing on the processed `input_ids` and `labels`, resulting in concatenated `input_ids` and `labels` up to the `seq_length`. The `actual_seq_len` refers to the sequence length of each sub-sample in the concatenated sample. During training, this parameter will be used to generate the corresponding data mask. -3. If `compress_mask=False` is set in `adaptor_config`, a complete data mask will be returned during training. Otherwise, `actual_seq_len` will be returned. - -#### Offline Dataset Processing - -In addition to supporting online dataset loading and processing, `CommonDataLoader` also supports offline dataset processing and saving. - -The [datasets_preprocess.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/toolkit/data_preprocess/huggingface/datasets_preprocess.py) script can be used to process Huggingface datasets offline and save them. - -- Parameter Description - - | Parameter Name | Description | Type | - |----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----:| - | config | Configuration file for offline data processing, which is used in the same way as online processing. Refer to [dataloader](#dataloader-parameter-description) for details. | str | - | save_path | Path where the preprocessed dataset will be saved. | str | - | register_path | Registration path for the model API, which includes the Python files related to the model, typically the model folder under the `research` directory. | int | - -- Usage Example - - You can use the configuration file provided in the [dataset packing](#dataset-packing) example and execute the following command. - - ```shell - python toolkit/data_preprocess/huggingface/datasets_preprocess.py \ - --config data_process.yaml \ - --save_path /path/processed_data - ``` - - If you need to load the saved dataset, you should modify the YAML configuration as follows: - - ```yaml - train_dataset: &train_dataset - input_columns: &input_columns ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] - construct_args_key: *input_columns - data_loader: - type: CommonDataLoader - shuffle: False - load_func: "load_from_disk" - path: "/path/processed_data" - adaptor_config: - compress_mask: False - ``` diff --git a/docs/mindformers/docs/source_en/function/distributed_parallel.md b/docs/mindformers/docs/source_en/function/distributed_parallel.md deleted file mode 100644 index 56f5544310cb3e84d5a9350d55cf0d8f768958d5..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/function/distributed_parallel.md +++ /dev/null @@ -1,171 +0,0 @@ -# Distributed Parallelism - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/function/distributed_parallel.md) - -## Parallel Modes and Application Scenarios - -Large-scale deep learning model training requires robust computing power, especially in the case of a large dataset and a complex model architecture. As such, a single device usually cannot meet this requirement. To solve this problem, MindSpore provides a set of powerful parallelism strategies for configuration. 
You can use flexible parallelism strategies to greatly improve training efficiency and reduce computing resource consumption. - -MindSpore offers parallel modes including data parallelism, model parallelism, pipeline parallelism, and sequence parallelism. They can be used independently or combined as a hybrid parallelism strategy to meet different model training requirements. By adopting proper parallelism strategies, you can leverage the computing resources of multiple devices, significantly improving the training efficiency. - -In actual applications, different parallelism strategies apply to different scenarios. - -- **Data parallelism**: applies to a simple model with a lot of data. -- **Model parallelism**: applies to a model with a huge number of parameters that a single device cannot accommodate. -- **Pipeline parallelism**: applies to ultra-large-scale model training that requires multi-device computing. -- **Sequence parallelism**: applies to a model with input of long sequences, reducing the GPU memory usage of a single device. -- **Multi-copy parallelism**: uses sequential scheduling algorithm to control the parallelism of fine-grained multi-branch operations, improving the overlap of computing and communications. -- **Optimizer parallelism**: distributes computing tasks of optimizers to multiple devices to reduce memory usage and improve training efficiency. - -> The parallelism strategy configuration in the YAML file provided by the repository has been optimized. Currently, you are recommended to use semi-automatic parallelism for optimal performance and stability. - -## Parallelism Features Supported by MindSpore Transformers - -MindSpore Transformers supports multiple parallelism features. You can use these features to optimize the training of different model architectures and hardware configurations. The following table outlines these parallelism features and provides links to the details in the MindSpore documentation. - -| **Parallelism Feature** | **Description** | -|-----------------------------------|---------------------------------------------------------------------------------| -| **[Data parallelism](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/data_parallel.html)** | Splits data to multiple devices and trains the data on each device at the same time. This mode applies to training a simple model with a lot of data. | -| **[Model parallelism](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/operator_parallel.html)** | Distributes model parameters to multiple devices. This mode applies to the scenario where a single device cannot accommodate the entire model. | -| **[Pipeline parallelism](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/pipeline_parallel.html)** | Divides an ultra-large model into multiple phases with each running on different devices for efficient training. | -| **[Optimizer parallelism](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/optimizer_parallel.html)** | Distributes the optimizer computation to multiple devices to reduce memory usage and improve training efficiency. | -| **Sequence parallelism** | Designed to share the memory and computation that cannot be sliced by model parallel, the inputs of LayerNorm and Dropout in the Transformer layer are sliced according to the sequence dimension to reduce the memory pressure on a single device. 
|
-| **[Long sequence parallelism](#long-sequence-parallelism)** | Slices all inputs and output activations by sequence to further reduce the GPU memory usage of the model for processing long sequence inputs. |
-| **[Multi-copy parallelism](https://www.mindspore.cn/docs/en/r2.6.0/features/parallel/pipeline_parallel.html#mindspore-interleaved-pipeline-scheduler)** | Implements fine-grained parallel control among multiple copies to optimize performance and resource utilization. This mode is suitable for efficient training of large-scale models. |
-
-For details about how to configure distributed parallel parameters, see the [MindSpore Transformers Configuration Description](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html).
-
-## Introduction to Parallel Features
-
-### Long Sequence Parallelism
-
-From generative AI to scientific models, long sequence training is becoming very important. Existing parallel methods such as data, tensor, and pipeline parallelism cannot slice in the sequence dimension. As the sequence dimension (S) grows, the training memory overhead grows at the rate of O($S^2$). Sequence parallelism slices all inputs and output activations in the sequence dimension, relaxing the limitation on input sequence length and efficiently supporting ultra-long sequence training.
-
-#### Ring Attention Sequence Parallelism
-
-Ring Attention is a representative long sequence parallelism technique in the current industry. It is used to solve the memory overhead problem during long sequence training while overlapping computation and communication. The Ring Attention algorithm utilizes the chunking property of Attention: when the sequence parallelism degree is N, Q, K, and V are each sliced into N sub-chunks, and each card calls the Flash Attention algorithm to compute the Attention result of its local QKV sub-chunks. Since each card only needs to compute the Attention of the sliced QKV sub-chunks, its memory occupation is reduced significantly. While doing the FA computation, Ring Attention uses ring communication to collect sub-chunks from and send sub-chunks to neighboring cards, maximizing the overlap of computation and communication and guaranteeing the overall performance of long sequence parallelism.
-
-MindSpore Transformers supports configuring the Ring Attention sequence parallel scheme, which can be enabled with the following configuration items:
-
-```yaml
-model:
-  model_config:
-    ...
-    use_ring_attention: True
-    ...
-parallel_config:
-  ...
-  context_parallel: 2
-  ...
-```
-
-Parameter Descriptions:
-
-- use_ring_attention: Whether to enable Ring Attention. Default is False.
-- context_parallel: The number of sequence parallel slices. Default is 1; configure it according to your requirements.
-
-For the configuration method of distributed parallel parameters, refer to the Parallel Configuration section in the [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html).
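-
-To make the chunking idea above concrete, the following is a minimal, single-process NumPy sketch of block-wise attention with an online softmax, which is essentially the computation each card performs on its local Q chunk while K/V sub-chunks arrive one by one over the ring. The ring communication itself is omitted, and all names and shapes here are illustrative assumptions rather than MindSpore Transformers APIs:
-
-```python
-import numpy as np
-
-def blockwise_attention(q, k_chunks, v_chunks):
-    """Attention for one Q block, consuming K/V sub-chunks one at a time.
-
-    A running row-wise max (m), normalizer (l) and accumulator (acc) keep the
-    softmax exact without ever materializing the full score matrix.
-    """
-    d = q.shape[-1]
-    m = np.full(q.shape[0], -np.inf)      # running row-wise max
-    l = np.zeros(q.shape[0])              # running softmax normalizer
-    acc = np.zeros_like(q)                # running weighted sum of V
-    for k, v in zip(k_chunks, v_chunks):  # in Ring Attention these arrive from neighboring cards
-        s = q @ k.T / np.sqrt(d)          # scores against this K sub-chunk only
-        m_new = np.maximum(m, s.max(axis=-1))
-        scale = np.exp(m - m_new)
-        p = np.exp(s - m_new[:, None])
-        acc = acc * scale[:, None] + p @ v
-        l = l * scale + p.sum(axis=-1)
-        m = m_new
-    return acc / l[:, None]
-
-# Check against full attention with the K/V sequence split into 4 sub-chunks.
-rng = np.random.default_rng(0)
-seq_len, head_dim, num_chunks = 16, 8, 4
-q, k, v = (rng.standard_normal((seq_len, head_dim)) for _ in range(3))
-scores = q @ k.T / np.sqrt(head_dim)
-weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
-reference = (weights / weights.sum(axis=-1, keepdims=True)) @ v
-assert np.allclose(blockwise_attention(q, np.split(k, num_chunks), np.split(v, num_chunks)), reference)
-```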
-
-#### Ulysses Sequence Parallelism
-
-The [Ulysses long sequence parallelism scheme](https://arxiv.org/abs/2309.14509) proposed by DeepSpeed slices individual samples along the seq dimension across different compute cards; then, prior to the attention computation, an all-to-all communication operation is performed on Q, K, and V so that each compute card receives the complete sequence, allowing each compute card to compute different attention heads in parallel. Finally, another all-to-all is used after the attention computation to collect results along the attention head dimension while re-slicing along the seq dimension. This scheme effectively extends the length of the trained sequences while keeping communication overhead relatively low.
-
-MindSpore Transformers supports configuring the Ulysses sequence parallel scheme, which can be enabled with the following configuration items:
-
-```yaml
-model:
-  model_config:
-    ...
-    use_attn_mask_compression: True # Enable attention_mask compression
-    ...
-parallel:
-  ...
-  enable_alltoall: True # Allow use of the alltoall communication operator
-  ...
-parallel_config:
-  ...
-  context_parallel: 2
-  context_parallel_algo: ulysses_cp # Enable Ulysses sequence parallelism
-  ...
-```
-
-Parameter Descriptions:
-
-- use_attn_mask_compression: Whether to compress the mask applied to the score matrix in Self-Attention. Default is False; it is recommended to turn it on to reduce the GPU memory usage of the Ulysses sequence parallel scheme.
-- enable_alltoall: Whether to generate the alltoall communication operator. Default is False; when this parameter is not enabled, alltoall is replaced by a combination of other operators such as allgather. See the MindSpore `set_auto_parallel_context` [interface documentation](https://www.mindspore.cn/docs/en/r2.6.0/api_python/mindspore/mindspore.set_auto_parallel_context.html). In the Ulysses scenario we want the alltoall communication operator to be inserted directly, so this configuration item is turned on.
-- context_parallel_algo: Set to `ulysses_cp` to enable Ulysses sequence parallelism.
-
-For the configuration method of distributed parallel parameters, refer to the Parallel Configuration section in the [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html).
-
-#### Hybrid Sequence Parallelism
-
-Currently, both the Ulysses and Ring Attention sequence parallel schemes have certain limitations. Although the Ring Attention sequence parallel scheme can theoretically expand the sequence length infinitely, its communication and computation bandwidth utilization is low, and its performance is inferior to the Ulysses sequence parallel scheme when the sequence block size is small. In GQA and MQA scenarios, the sequence parallelism degree of Ulysses is limited by the number of heads, which limits how far the sequence length can be expanded. Hybrid sequence parallelism fuses the Ulysses and Ring Attention sequence parallelism schemes and can resolve the above defects.
-
-MindSpore Transformers supports configuring the hybrid sequence parallel scheme, which can be enabled with the following configuration items:
-
-```yaml
-parallel:
-  ...
-  enable_alltoall: True # Allow use of the alltoall communication operator
-  ...
-parallel_config:
-  ...
-  context_parallel: 16
-  context_parallel_algo: hybrid_cp # Enable hybrid sequence parallelism
-  ulysses_degree_in_cp: 8
-  ...
-```
-
-Parameter Descriptions:
-
-- context_parallel_algo: Hybrid sequence parallelism is enabled when set to `hybrid_cp`.
-- ulysses_degree_in_cp: The number of Ulysses sequence parallel slices.
-
-For the configuration method of distributed parallel parameters, refer to the Parallel Configuration section in the [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html).
-
-### Pipeline Parallelism
-
-#### Sequence Pipeline Parallelism (Seq-Pipe)
-
-The model inputs are segmented along the sequence dimension and unfolded into multiple sequence chunks. Compared with the original 1F1B (One Forward One Backward) and 1F1B-Interleave methods, the scheduling unit is reduced to a sequence chunk. `seq_split_num` represents the number of sequence chunks; when `seq_split_num` is 1, it degrades to 1F1B or 1F1B-Interleave.
-
-MindSpore Transformers supports configuring Seq-Pipe pipeline parallelism, which can be enabled through the following configuration items:
-
-```yaml
-# parallel context
-parallel:
-  pipeline_config:
-    pipeline_interleave: true
-    pipeline_scheduler: 'seqpipe'
-
-# parallel config
-parallel_config:
-  seq_split_num: 2
-```
-
-Parameter Descriptions:
-
-- pipeline_scheduler: The scheduling strategy for the pipeline. Currently, MindSpore Transformers only supports setting this to `"seqpipe"`.
-- seq_split_num: The number of sequence chunks into which the input is split along the sequence dimension.
-
-Notes:
-
-- Currently, only Llama and DeepSeek series models are supported.
-- Using Megatron's multi-source datasets for training is not yet supported.
-
-For more information on configuring distributed parallel parameters, see the [MindSpore Transformers configuration description](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html), specifically the section on parallel configuration.
-
-## MindSpore Transformers Distributed Parallel Application Practices
-
-In the [Llama3-70B fine-tuning configuration](https://gitee.com/kong_de_shu/mindformers/blob/dev/research/llama3/llama3_70b/finetune_llama3_70b.yaml#) file provided on the official website, multiple distributed parallelism strategies are used to improve the training efficiency in the multi-node multi-device environment. The main parallelism strategies and key parameters involved in the configuration file are as follows (a combined configuration sketch is given after the note below):
-
-- **Data parallelism**: No additional data parallelism is enabled (`data_parallel: 1`).
-- **Model parallelism**: A model is sliced into eight parts, which are computed on different devices (`model_parallel: 8`).
-- **Pipeline parallelism**: A model is divided into eight pipeline phases, which run on different devices in sequence (`pipeline_stage: 8`).
-- **Sequence parallelism**: After it is enabled (`use_seq_parallel: True`), the inputs of LayerNorm and Dropout at the Transformer layer are sliced by sequence. In this way, each device only needs to process part of LayerNorm and Dropout, reducing the model GPU memory usage.
-- **Multi-copy parallelism**: A sequential scheduling algorithm is used to control the parallelism of fine-grained multi-branch operations (`fine_grain_interleave: 2`), improving the overlap of computing and communications.
-- **Optimizer parallelism**: The calculation of optimizers is distributed to multiple devices to reduce memory usage (`enable_parallel_optimizer: True`).
-
-> Note: Sequence parallelism must be turned on whenever fine-grained multi-copy parallelism is turned on.
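-
-Putting the key parameters above together, a minimal sketch of the corresponding configuration sections might look as follows. Only the options discussed above are shown (the actual file contains many more entries), and with `data_parallel * model_parallel * pipeline_stage = 1 * 8 * 8` this setup occupies 64 cards:
-
-```yaml
-parallel:
-  enable_parallel_optimizer: True   # optimizer parallelism
-parallel_config:
-  data_parallel: 1                  # no additional data parallelism
-  model_parallel: 8                 # model sliced into eight parts
-  pipeline_stage: 8                 # eight pipeline phases
-  use_seq_parallel: True            # slice LayerNorm/Dropout inputs by sequence
-  fine_grain_interleave: 2          # fine-grained multi-copy parallelism
-```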
- -With the preceding configurations, the distributed training on Llama3-70B can effectively utilize hardware resources in a multi-node multi-device environment to implement efficient and stable model training. diff --git a/docs/mindformers/docs/source_en/function/fine_grained_activations_swap.md b/docs/mindformers/docs/source_en/function/fine_grained_activations_swap.md deleted file mode 100644 index e438785bbae3c2fe86341dcdfeb61850725a8e27..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/function/fine_grained_activations_swap.md +++ /dev/null @@ -1,272 +0,0 @@ -# Fine-Grained Activations SWAP - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/function/fine_grained_activations_swap.md) - -## Overview - -In traditional large-scale model training tasks, the memory resources of computing cards often become a bottleneck. Although adopting larger-scale model parallel (mp) and pipeline parallel (pp) can alleviate the memory pressure on individual computing cards to some extent, it requires larger-scale cluster resources, and excessive communication can significantly reduce the model's Model FLOPs Utilization (MFU). Under limited cluster resources, recomputation is another effective method to mitigate memory pressure. It reduces the memory footprint of activations by discarding the storage of activation values during the forward propagation phase and recomputing the required activation values during gradient backpropagation. However, since recomputation introduces additional computational overhead, this method also significantly decreases the MFU of model training. - -Against this backdrop, fine-grained activations SWAP can provide a third effective approach to reduce memory usage while offering greater end-to-end performance advantages. Specifically, SWAP offloads activations that need to be stored long-term to the host side during the forward propagation phase and prefetches them back to the device side in advance when they are needed during backpropagation. In terms of resource utilization, fine-grained activations SWAP leverages D2H/H2D bandwidth, which can overlap with computation tasks and D2D communication tasks during training, thereby masking the overhead of memory transfers. - -The fine-grained activations SWAP technology offers high flexibility in usage. During the forward propagation phase of large model training, multiple activations of varying data sizes are generated, allowing users to swap specific activations at the granularity of the operator selectively. When the model type or configuration changes, users can flexibly adjust the corresponding SWAP strategy to minimize memory overhead and achieve optimal performance. 
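-
-The offload/prefetch overlap described above can be pictured with a small, framework-agnostic sketch. NumPy arrays stand in for device tensors and a background thread stands in for the D2H/H2D copy engine; none of these names are MindSpore Transformers APIs, and the real feature is driven entirely by the `swap_config` options described in the next section:
-
-```python
-import threading
-import numpy as np
-
-class SwappedActivation:
-    """Offload an activation to the host asynchronously and prefetch it back later."""
-
-    def __init__(self, device_tensor: np.ndarray):
-        self._host_copy = None
-        self._offloaded = threading.Event()
-        # Start the device-to-host copy in the background so that it can
-        # overlap with the forward computation that follows.
-        threading.Thread(target=self._offload, args=(device_tensor,)).start()
-
-    def _offload(self, device_tensor: np.ndarray) -> None:
-        self._host_copy = device_tensor.copy()  # simulated D2H copy
-        self._offloaded.set()                   # only now may the device block be released
-
-    def prefetch(self) -> np.ndarray:
-        # Called ahead of the backward step that consumes the activation;
-        # blocks only if the offload has not finished yet.
-        self._offloaded.wait()
-        return self._host_copy                  # simulated H2D copy
-
-# Forward phase: produce an activation, start offloading it, keep computing.
-activation = np.random.rand(4, 8).astype(np.float32)
-swapped = SwappedActivation(activation)
-# ... subsequent forward layers would run here, overlapping with the copy ...
-# Backward phase: prefetch the activation back before it is needed.
-restored = swapped.prefetch()
-assert np.array_equal(activation, restored)
-```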
-
-## Instruction for Use
-
-### Constraint Scenarios
-
-- Only supports the static graph O0/O1 mode
-- Compatible with Llama-family dense models; MoE sparse models are to be supported in future updates
-- Somas does not support heterogeneity, so the following needs to be set in the configuration file:
-
-  ```yaml
-  context:
-    memory_optimize_level: O0
-  ```
-
-- When pipeline parallelism is disabled, the lazy_inline scenario must be enabled by setting the environment variable:
-
-  ```bash
-  ENABLE_LAZY_INLINE_NO_PIPELINE=1
-  ```
-
-- Only supports the Ascend backend
-
-### Instruction for API
-
-Fine-grained activations SWAP is enabled through the `swap_config` field in the YAML configuration, which includes four functional interfaces: `swap`, `default_prefetch`, `layer_swap`, and `op_swap`. These interfaces allow users to flexibly enable SWAP for specific layers or specific operators within layers.
-
-> The MindSpore framework currently decouples memory offloading and memory release. When activations are offloaded from the device side to the host side, the memory space occupied on the device side is not immediately released even after all data has been transferred. An explicit release operation is required instead. Before triggering the memory release, the system checks whether the activation offloading is complete. If not, the process will wait in place until the offloading finishes.
-
-| Configuration Item | Type | Description |
-|:--:|:--:|:---|
-| swap | bool | Default False. When set to False, all four functional interfaces are disabled. When set to True, activations SWAP is enabled, and the system checks whether layer_swap and op_swap are None. If both are None, the default SWAP strategy is applied, which enables SWAP for the flash_attention operator across all layers. If either layer_swap or op_swap has a non-None value, the default policy is overridden, and SWAP is enabled according to the configurations in layer_swap and op_swap. |
-| default_prefetch | int | Default 1 and only takes effect when swap=True, layer_swap=None, and op_swap=None. It controls the timing of releasing memory in the forward phase and starting prefetch in the backward phase of the default SWAP strategy. A larger `default_prefetch` delays memory release during the forward phase, keeping device memory occupied by activations locked for an extended period after offloading and preventing reuse by other data blocks; it also starts prefetching from host to device earlier during the backward phase, applying memory pressure prematurely. A smaller `default_prefetch` releases memory earlier in the forward phase but may introduce idle waiting for copy operations to complete; in addition, delayed prefetch in the backward phase may cause computation stalls if prefetching is not finished before the activation is used, impacting end-to-end performance. This interface allows users to fine-tune memory release and prefetch timing for optimal memory efficiency and performance. |
-| layer_swap | list | Default None. When set to None, this interface is inactive. When the type is List, this interface contains several list elements of the Dict type. Each Dict element contains two keys, `backward_prefetch` and `layers`, and provides the prefetch timing and layer indices for enabling SWAP. |
-| op_swap | list | Default None. When set to None, this interface is inactive. When the type is List, this interface contains several list elements of the Dict type.
Each Dict element contains three keys: `op_name`, `backward_prefetch`, and `layers`, and provides the prefetch opportunity, operator name, and layer index for enabling swap. | - -### Used together with Recomputation - -Fine-Grained Activations SWAP and Recomputation have coupling effects: - -1. If any operator has both recomputation and SWAP enabled simultaneously, recomputation will take effect while SWAP will not. -2. For any operator with SWAP enabled, if its output is used by an operator with recomputation enabled, then SWAP for that operator will not take effect. -3. The YAML configuration interface for recomputation only supports enabling recomputation for a specific number of layers sequentially from front to back, rather than selecting specific layers or specific operators within layers. This means when using both SWAP and recomputation together, SWAP can only be enabled for later layers or operators within later layers, preventing full utilization of SWAP's benefits. Therefore, when and only when `swap=True`, the recomputation interface functionality will be adjusted as shown in the table below. - -| Interface Name | Original Functionality | Functionality When Enabling SWAP | -|:--:|:---|:---| -| recompute | Determine the number of layers with recomputation enabled in each pipeline stage. | Pipeline stage-agnostic, only accepts bool/list type inputs. When bool type: enables recomputation for all layers; when list type: uses layer indices to enable recomputation for specific layers. | -| select_recompute | Determine the number of layers with recomputation enabled for specific operators in each pipeline stage. | Pipeline stage-agnostic, for each operator's key-value pair, only accepts bool/list type inputs. When bool type: enables recomputation for all layers; when list type: uses layer indices to enable recomputation for specific layers. | -| select_comm_recompute | Determine the number of layers with recomputation enabled for communication operators in each pipeline stage. | Pipeline stage-agnostic, only accepts bool/list type inputs. When bool type: enables recomputation for all layers; when list type: uses layer indices to enable recomputation for specific layers. | - -## Cases of Fine-Grained Activations SWAP - -This section demonstrates the usage of fine-grained activations SWAP using Llama2-7B training as an example. - -### Environmental Preparation - -Download Mindformers, and prepare the pre-training dataset, such as wikitext. - -### Case 1: Default SWAP Strategy - -Modify and supplement the recomputation and SWAP configurations in YAML as follows: - -```yaml -context: - memory_optimize_level: "O0" -model: - model_config: - num_layers: 4 -recompute_config: - recompute: False - select_recompute: False - select_comm_recompute: False -swap_config: - swap: True - default_prefetch: 10 -``` - -Execute the following script to launch single-node 8-NPU training, with the script's execution path being the root directory, requiring the user to specify the YAML file path(machine_ip needs to fill in the local environment IP address): - -```bash -export GLOG_v=1 -export MS_MEMORY_STATISTIC=1 -export ENABLE_LAZY_INLINE_NO_PIPELINE=1 -YAML_FILE=$1 # User specifies the YAML file path. 
-ROOT_PATH=`pwd` - -bash ./scripts/msrun_launcher.sh "run_mindformer.py \ - --config ${ROOT_PATH}/${YAML_FILE} \ - --run_mode train \ - --use_parallel True" \ - 8 8 8118 0 output/msrun False 300 -``` - -After training completes, execute the command `cat output/msrun/worker_0.log | grep 'attention.flash_attention'` to check the execution status of the default SWAP strategy: - -```text --INFO - Set op_swap at layer 0: attention.flash_attention, value=10 --INFO - Set op_swap at layer 1: attention.flash_attention, value=10 --INFO - Set op_swap at layer 2: attention.flash_attention, value=10 --INFO - Set op_swap at layer 3: attention.flash_attention, value=10 -``` - -The default SWAP strategy is executed successfully. - -### Case 2: Select Specific Layers to Enable SWAP - -Modify and supplement the recomputation and SWAP configurations in YAML as follows: - -```yaml -context: - memory_optimize_level: "O0" -model: - model_config: - num_layers: 4 -recompute_config: - recompute: False - select_recompute: False - select_comm_recompute: False -swap_config: - swap: True - layer_swap: - - backward_prefetch: 20 - layers: [0,3] -``` - -Execute the following script to launch single-node 8-NPU training, with the script's execution path being the root directory, requiring the user to specify the YAML file path(machine_ip needs to fill in the local environment IP address): - -```bash -export GLOG_v=1 -export MS_MEMORY_STATISTIC=1 -export ENABLE_LAZY_INLINE_NO_PIPELINE=1 -YAML_FILE=$1 # User specifies the YAML file path. -ROOT_PATH=`pwd` - -bash ./scripts/msrun_launcher.sh "run_mindformer.py \ - --config ${ROOT_PATH}/${YAML_FILE} \ - --run_mode train \ - --use_parallel True" \ - 8 8 8118 0 output/msrun False 300 -``` - -After training completes, execute the command `cat output/msrun/worker_0.log | grep 'Set layer swap at'` to check the execution status of the default SWAP strategy: - -```text --INFO - Set layer swap at layer 0 and value is: 20 --INFO - Set layer swap at layer 3 and value is: 20 -``` - -The strategy of enabling SWAP for specific layers is executed successfully. - -### Case 3: Select Specific Operators within Layers to Enable SWAP - -Modify and supplement the recomputation and SWAP configurations in YAML as follows: - -```yaml -context: - memory_optimize_level: "O0" -model: - model_config: - num_layers: 4 -recompute_config: - recompute: False - select_recompute: False - select_comm_recompute: False -swap_config: - swap: True - op_swap: - - op_name: 'attention' - backward_prefetch: 20 - layers: [0,1,2] - - op_name: 'attention' - backward_prefetch: 10 - layers: [3] - - op_name: 'feed_forward' - backward_prefetch: 15 - layers: [1,2] -``` - -Execute the following script to launch single-node 8-NPU training, with the script's execution path being the root directory, requiring the user to specify the YAML file path(machine_ip needs to fill in the local environment IP address): - -```bash -export GLOG_v=1 -export MS_MEMORY_STATISTIC=1 -export ENABLE_LAZY_INLINE_NO_PIPELINE=1 -YAML_FILE=$1 # User specifies the YAML file path. 
-ROOT_PATH=`pwd` - -bash ./scripts/msrun_launcher.sh "run_mindformer.py \ - --config ${ROOT_PATH}/${YAML_FILE} \ - --run_mode train \ - --use_parallel True" \ - 8 8 8118 0 output/msrun False 300 -``` - -After training completes, execute the command `cat output/msrun/worker_0.log | grep 'Set op_swap at layer'` to check the execution status of the default SWAP strategy: - -```text --INFO - Set op_swap at layer 0: .attention, value=20 --INFO - Set op_swap at layer 1: .attention, value=20, .feed_forward, value=15 --INFO - Set op_swap at layer 2: .attention, value=20, .feed_forward, value=15 --INFO - Set op_swap at layer 3: .attention, value=10 -``` - -The strategy of enabling SWAP for specific operators within layers is executed successfully. - -### Case 4: Use Fine-Grained Activations SWAP together with Recomputation - -Modify and supplement the recomputation and SWAP configurations in YAML as follows: - -```yaml -context: - memory_optimize_level: "O0" -model: - model_config: - num_layers: 4 -recompute_config: - recompute: False - select_recompute: - 'feed_forward': [0,3] - select_comm_recompute: False -swap_config: - swap: True - op_swap: - - op_name: 'attention' - backward_prefetch: 20 - layers: [0,1,2] - - op_name: 'attention' - backward_prefetch: 10 - layers: [3] - - op_name: 'feed_forward' - backward_prefetch: 15 - layers: [1,2] -``` - -Execute the following script to launch single-node 8-NPU training, with the script's execution path being the root directory, requiring the user to specify the YAML file path(machine_ip needs to fill in the local environment IP address): - -```bash -export GLOG_v=1 -export MS_MEMORY_STATISTIC=1 -export ENABLE_LAZY_INLINE_NO_PIPELINE=1 -YAML_FILE=$1 # User specifies the YAML file path. -ROOT_PATH=`pwd` - -bash ./scripts/msrun_launcher.sh "run_mindformer.py \ - --config ${ROOT_PATH}/${YAML_FILE} \ - --run_mode train \ - --use_parallel True" \ - 8 8 8118 0 output/msrun False 300 -``` - -After training completes, execute the command `cat output/msrun/worker_0.log | grep 'Set op_swap at layer' -C 1` to check the execution status of the default SWAP strategy: - -```text --INFO - Set select recompute at layer 0: feed_forward --INFO - Set op_swap at layer 0: .attention, value=20 --INFO - Set op_swap at layer 1: .attention, value=20, .feed_forward, value=15 --INFO - Set op_swap at layer 2: .attention, value=20, .feed_forward, value=15 --INFO - Set select recompute at layer 3: feed_forward --INFO - Set op_swap at layer 3: .attention, value=10 -``` - -The strategy of enabling fine-grained activations SWAP together with recomputation is executed successfully. diff --git a/docs/mindformers/docs/source_en/function/high_availability.md b/docs/mindformers/docs/source_en/function/high_availability.md deleted file mode 100644 index 91402c96a76e77244d9c4b7e91cfaaaaa3c4553d..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/function/high_availability.md +++ /dev/null @@ -1,236 +0,0 @@ -# High Availability - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/function/high_availability.md) - -## Overview - -MindSpore Transformers high availability provides the following three functions: - -- **End-of-life CKPT**: It is mainly aimed at accelerating the fault recovery in the training process of large models. 
This feature verifies the integrity and consistency of the intermediate state data after a fault occurs during the training process and generates an end-of-life CheckPoint data, which can be used to recover the training and reduce the loss of training iterations caused by the fault. -- **UCE Fault-tolerant Recovery**: It mainly focuses on the detection of UCE faults in on-chip memory during the training process of large models, and accomplishes online repair to reach Step-level recomputation. -- **Process-Level Rescheduling Recovery**: Instead of pulling up the entire cluster again after an anomaly in training occurs, simply restart or replace it on a node-by-node basis to complete the repair and continue training. - -The high availability feature is currently only supported in the MindSpore Ascend back-end graph schema; this feature also needs to support Step-level recovery, so only a sink_size of 1 is supported when configuring data sinking. - -The high availability feature is based on the existence of a replica relationship between the two cards so that when one of the cards fails, it can be recovered from the other card, and therefore there will be two copies of redundancy in both the weights and the optimizer, which will take up more video memory. To ensure this redundancy relationship, data parallelism must be turned on to ensure that there are two cards with the same weights, and also if optimizer parallelism is turned on, it must be ensured that there are two cards with the same optimizer state. - -All three functions can be turned on at the same time or individually. When these three functions are turned on in combination, the order in which they take effect is: UCE Fault Tolerance Recovery -> Process-Level Rescheduling Recovery -> End-of-Life CKPT, and if one of the functions can be recovered, the next function will not be executed. The end-of-life CKPT function serves as a final safeguard, and the entire training process exits upon completion of this function, so it will be turned on by default when the other two functions are turned on. - -The end-of-life CKPT saving of the Checkpoint file and the renewal of training from that file use the existing MindSpore Transformers capabilities in the same way, except that the end-of-life CKPT relies on the strategy file, so that folder needs to be configured for both the training and the renewal of the training. - -When an exception triggers an end-of-life CheckPoint save, if de-redundant saving is not turned on, only one card in each data parallel field saves the CheckPoint, and the rest of the cards do not save the CheckPoint. Therefore, when resuming training, it is also necessary to enable the high availability feature in order to resume, otherwise the other cards will not be able to find the available CheckPoint and will report an error exit. Users can determine whether a CheckPoint is triggered by the end-of-life CKPT feature by calculating whether the number of CheckPoints saved by the distribution is less than the number of clusters. - -## Instructions for Use - -The high availability feature switch is enabled by an environment variable, and the switch is not set separately in the YAML configuration file, but the YAML file needs to be able to configure the weights and optimizer states to be the same for both cards, as detailed in the [Replica Relationships Configuration](#replica-relationships-configuration) section of this document. - -The high availability feature relies on the user to install the MindIO TFT SDK package. 
Please refer to [Install MindIO TFT SDK on compute nodes](https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft011.html). - -### Environment Variable Configuration - -```shell -export MINDIO_FOR_MINDSPORE=1 -export MS_ENABLE_TFT="{TTP:1,UCE:1,ARF:1}" -export MS_TFT_IP=127.0.0.1 -export MS_TFT_PORT=30051 -``` - -- `MINDIO_FOR_MINDSPORE`: Enabling MindIO TFT SDK to support MindSpore -- `MS_ENABLE_TFT`: Indicates that the TTP, UCE and ARF functions are enabled. If you want to enable only one of these functions, set the corresponding value to 1. - - **TTP (Try To Persist)**: End-of-life CKPT function - - **UCE (Uncorrectable Memory Error)**: UCE fault tolerance recovery - - **ARF (Air Refuelling)**: Process-level rescheduling recovery function - - When UCE or ARF is enabled, TTP is enabled by default. - -- `MS_TFT_IP` and `MS_TFT_PORT` represent the IP and port number of TFT Controller respectively, no default value, need to be specified by user. If the Controller is started by MindSpore Transformers, the IP and port number of the rank0 node in the user's cluster are configured. If the Controller is started by the user, configure the IP and port number of the Controller. - -### YAML Configuration - -The YAML configuration consists of two parts: the end-of-life CKPT saving and recovery configuration and the highly available replica relationship configuration. - -#### Saving and Restoring Configurations - -The end-of-life CheckPoint preservation and recovery capabilities are used for initial and renewal training respectively, which reuse the existing MindSpore Transformers configuration, and the following describes the configuration for initial and renewal training respectively. - -- **Initial Training Configuration** - - ```yaml - output_dir: './output' # The directory where CheckPoints and Strategies are stored - load_checkpoint: '' # Configuration is empty for initial training - src_strategy_path_or_dir: '/output/strategy/' - only_save_strategy: False - resume_training: False # Configuration is False for initial training - run_mode: 'train' - - callbacks: - - type: CheckpointMonitor - prefix: "llama2_13b" - save_checkpoint_steps: 100 - integrated_save: False - async_save: False - ``` - -- **Renewal Training Configuration** - - ```yaml - output_dir: './output' # The directory where CheckPoints and Strategies are stored - load_checkpoint: './output/checkpoint/' # Configure CheckPoint paths during renewal training - src_strategy_path_or_dir: '/output/strategy/' - only_save_strategy: False - resume_training: True # Configured to True for renewal training - run_mode: 'train' - - callbacks: - - type: CheckpointMonitor - prefix: "llama2_13b" - save_checkpoint_steps: 100 - integrated_save: False - async_save: False - ``` - -#### Replica Relationships Configuration - -The key to the three functions of high availability is to configure the weight and optimizer copy redundancy relationship. The core of the configuration is that the dimension of the data parallel domain is greater than 2, and if you overlay the optimizer parallelism, you need to ensure that the number of copies of the optimizer is greater than 2 at the same time. So the configuration is divided into two categories, with the optimizer parallelism and without the optimizer parallelism. The following is an example of how to configure 8 cards. 
- -- **Without the Optimizer Parallelism** - - Data parallelism dp configured as a multiple of 2 is sufficient, so that there will exist two cards with the same weights and optimizer state. - - ```yaml - parallel: - enable_parallel_optimizer: False - parallel_config: - data_parallel: 2 - model_parallel: 4 - pipeline_stage: 1 - ``` - -- **With the Optimizer Parallelism** - - After turning on the optimizer parallelism you must ensure that a copy of the optimizer state exists, the key to configure is optimizer_weight_shard_size to 2. The number of copies of the optimizer state at this point is data_parallel/optimizer_weight_shard_size. Therefore, if the data parallelism is configured to 2, there is no optimizer replica, and the data parallelism must be configured to 4; the number of replicas in this case is data_parallel/optimizer_weight_shard_size = 4/2 = 2. - - ```yaml - parallel: - enable_parallel_optimizer: True - parallel_optimizer_config: - optimizer_weight_shard_size: 2 - parallel_config: - data_parallel: 4 - model_parallel: 2 - pipeline_stage: 1 - ``` - -#### Examples - -This section demonstrates the use of the end-of-life CKPT using Llama2-13B training as an example. - -1. First install MindSpore and MindIO -2. Download MindSpore Transformers and modify the `configs/llama2/pretrain_llama2_13b_bf16.yaml` configuration file with the following main configuration: - - ```yaml - # runner config - runner_config: - epochs: 2 - batch_size: 4 - sink_mode: True - sink_size: 1 - - # ...... - - # parallel context config - parallel: - parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel - gradients_mean: False - enable_alltoall: False - full_batch: True - search_mode: "sharding_propagation" - enable_parallel_optimizer: True - strategy_ckpt_save_file: "./ckpt_strategy.ckpt" - parallel_optimizer_config: - gradient_accumulation_shard: False - parallel_optimizer_threshold: 64 - optimizer_weight_shard_size: 4 - - # ...... - - # default parallel of device num = 16 for Atlas 800T A2 - parallel_config: - data_parallel: 8 - model_parallel: 1 - pipeline_stage: 1 - use_seq_parallel: False - micro_batch_num: 1 - vocab_emb_dp: True - gradient_aggregation_group: 4 - ``` - - The following key points need to be noted: - - - `sink_size: 1`: Features such as end-of-life CKPT and UCE fault-tolerant recovery do not support scenarios where `sink_size` is greater than 1, so it is configured as 1 here. - - `enable_parallel_optimizer: True`: Enable optimizer parallelism. - - `optimizer_weight_shard_size: 4`: The slice size of optimizer parallelism is 4. - - `data_parallel: 8`: Data parallelism is configured as 8. - - As explained in the previous section, the value of `data_parallel/optimizer_weight_shard_size` is `8 / 4 = 2`, which is greater than 1, so there is a replica relationship. -3. Execute the following command to start the training - - ```bash - export MINDIO_FOR_MINDSPORE=1 - - export MS_ENABLE_TFT="{TTP:1,UCE:1,ARF:1}" - export MS_TFT_IP=127.0.0.1 - export MS_TFT_PORT=30051 - - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_13b_bf16.yaml \ - --train_dataset_dir "/YourDataSetPath" \ - --use_parallel True --run_mode train" 8 - ``` - - Note: You need to replace `/YourDataSetPath` with the path of the actual dataset. -4. After a few steps of training, terminate the worker process and trigger an end-of-life CKPT save - - Note: With the above startup method, the MindIO Controller is attached to worker 0. 
In this case, worker 0 cannot be terminated, or else the MindIO Controller will exit and the end-of-life CKPT cannot be triggered. However, when training is started via taskd, the MindIO Controller is a separate process and the worker 0 process can be terminated. -5. Confirm end-of-life CheckPoint generation - - At the end of the entire training process, the reasonableness of the final generated CheckPoint file is confirmed through the log as follows: - - 1). Execute the command `find output/checkpoint/ -name '*.ckpt'` to find the generated CheckPoint file: - - ```text - $ find output/checkpoint/ -name '*.ckpt' - output/checkpoint/rank_2/llama2_13b_rank_2-5_1.ckpt - output/checkpoint/rank_3/llama2_13b_rank_3-5_1.ckpt - output/checkpoint/rank_0/llama2_13b_rank_0-5_1.ckpt - output/checkpoint/rank_5/llama2_13b_rank_5-5_1.ckpt - ``` - - 2). Execute the command `cat output/msrun_log/worker_0.log | grep 'Epoch:'` to see the trained steps: - - ```text - $ cat output/msrun_log/worker_0.log | grep 'Epoch:' - 2025-04-07 15:34:27,308 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 1/ 19], loss: 10.649, per_step_time: 103328ms, lr: 0.0, overflow cond: False, loss_scale: 1.0, global_norm: [1 31049], train_throughput_per_npu: 2.896T - 2025-04-07 15:34:29,173 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 2/ 19], loss: 10.633, per_step_time: 1752ms, lr: 1e-05, overflow cond: False, loss_scale: 1.0, global_norm: [1 508834], train_throughput_per_npu: 170.738T - 2025-04-07 15:34:30,941 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 3/ 19], loss: 9.673, per_step_time: 1754ms, lr: 9.981987e-06, overflow cond: False, loss_scale: 1.0, global_norm [10.579812], train_throughput_per_npu: 170.523T - 2025-04-07 15:34:32,704 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 4/ 19], loss: 9.287, per_step_time: 1756ms, lr: 9.928079e-06, overflow cond: False, loss_scale: 1.0, global_norm [21.932272], train_throughput_per_npu: 170.319T - 2025-04-07 15:34:34,469 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 5/ 19], loss: 8.867, per_step_time: 1758ms, lr: 9.8386645e-06, overflow cond: False, loss_scale: 1.0, global_norm [16.986555], train_throughput_per_npu: 170.173T - ``` - - 3). Execute the command `cat output/msrun_log/worker_0.log | grep 'report group list:'` to see the replica relationships of MindIO output in the log: - - ```text - $ cat output/msrun_log/worker_0.log | grep 'report group list:' - 2025-04-07 15:34:27.363613 info 1879138 [TTP controller.cpp:1512] rank:4, report group list: [0, 4] - 2025-04-07 15:34:27.385564 info 1879139 [TTP controller.cpp:1512] rank:7, report group list: [3, 7] - 2025-04-07 15:34:27.393198 info 1879136 [TTP controller.cpp:1512] rank:6, report group list: [2, 6] - 2025-04-07 15:34:27.393515 info 1879142 [TTP controller.cpp:1512] rank:1, report group list: [1, 5] - ``` - - From the training step information above, we can see that the 5 steps that have been trained, and the number is the same as the 5 in the file name `llama2_13b_rank_2-5_1.ckpt` of CheckPoint. - - The copy relations `[0, 4]`, `[3, 7]`, `[2, 6]` and `[1, 5]` are known from the output in the log: - - - The rank 0 and rank 4 weights have a replica relationship, and the end-of-life checkpoint is stored in rank 0. - - The rank 3 and rank 7 weights have a replica relationship, and the end-of-life checkpoint is stored in rank 3. 
- - The rank 2 and rank 6 weights have a replica relationship, and the end-of-life checkpoint is stored in rank 2. - - There is a replica relationship between rank 1 and rank 5 weights, and since worker 1 terminates, the final checkpoint is stored in rank 5. \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/function/image/wikitext_sample.png b/docs/mindformers/docs/source_en/function/image/wikitext_sample.png deleted file mode 100644 index ea2a38a93b3ac3d3ad1d96e1a4c5afae8868291e..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/function/image/wikitext_sample.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/function/monitor.md b/docs/mindformers/docs/source_en/function/monitor.md deleted file mode 100644 index ebaf720cc05215ba823061bc91b15062f08e67a0..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/function/monitor.md +++ /dev/null @@ -1,223 +0,0 @@ -# Training Metrics Monitoring - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/function/monitor.md) - -MindSpore Transformers supports TensorBoard as a visualization tool for monitoring and analyzing various metrics and information during training. TensorBoard is a standalone visualization library that requires the user to manually install it, and it provides an interactive way to view loss, precision, learning rate, gradient distribution, and a variety of other things in training. After the user configures TensorBoard in the training `yaml` file, the event file is generated and updated in real time during the training of the large model, and the training data can be viewed via commands. - -## Configuration Descriptions - -Configure the "monitor_config", "tensorboard" and "callbacks" keywords in the training `yaml` file, and the training will save the tensorboard event file under the configured save address. -A sample configuration is shown below: - -### Configuration Sample of `yaml` File - -```yaml -seed: 0 -output_dir: './output' - -monitor_config: - monitor_on: True - dump_path: './dump' - target: ['layers.0', 'layers.1'] # Monitor only the first and second level parameters - invert: False - step_interval: 1 - local_loss_format: ['log', 'tensorboard'] - local_norm_format: ['log', 'tensorboard'] - device_local_norm_format: ['log', 'tensorboard'] - optimizer_state_format: null - weight_state_format: null - throughput_baseline: null - print_struct: False - -tensorboard: - tensorboard_dir: 'worker/tensorboard' - tensorboard_queue_size: 10 - log_loss_scale_to_tensorboard: True - log_timers_to_tensorboard: True - -callbacks: - - type: MFLossMonitor - per_print_times: 1 -``` - -| monitor_config field parameter name | Descriptions | Types | -|-----------------------------------------|------------------------------------------------------------------------------------------|---------------| -| monitor_config.monitor_on | Sets whether monitoring is enabled. The default is `False`, when all the following parameters do not take effect | bool | -| monitor_config.dump_path | Sets the path where the `local_norm`, `device_local_norm`, and `local_loss` metrics files are saved during training. When not set or set to `null` take the default value '. 
/dump' | str |
-| monitor_config.target | Sets the names (fragments) of the target parameters monitored by the indicators `optimizer_state` and `local_norm`; regular expressions are supported. When not set or set to `null`, the default value ['.*'] is used, i.e., all parameters are specified. | list[str] |
-| monitor_config.invert | Sets whether to invert the selection specified by `monitor_config.target`. Defaults to `False`. | bool |
-| monitor_config.step_interval | Sets the frequency of logging the indicators. Default is 1, i.e., record once per step. | int |
-| monitor_config.local_loss_format | Sets the logging form of the indicator `local_loss`. | str or list[str] |
-| monitor_config.local_norm_format | Sets the logging form of the indicator `local_norm`. | str or list[str] |
-| monitor_config.device_local_norm_format | Sets the logging form of the indicator `device_local_norm`. | str or list[str] |
-| monitor_config.optimizer_state_format | Sets the logging form of the indicator `optimizer_state`. | str or list[str] |
-| monitor_config.weight_state_format | Sets the logging form of the indicator `weight L2-norm`. | str or list[str] |
-| monitor_config.throughput_baseline | Sets the baseline value for the metric `throughput linearity`, which needs to be positive. It will be written to both TensorBoard and the logs. Defaults to `null` when not set, indicating that the metric is not monitored. | int or float |
-| monitor_config.print_struct | Sets whether to print all trainable parameter names of the model. If `True`, the names of all trainable parameters are printed at the start of the first step and training exits at the end of that step. Default is `False`. | bool |
-
-The optional values for the parameters of the form xxx_format above are the strings 'tensorboard' and 'log' (for writing to TensorBoard and writing to the log, respectively), a list of both, or `null`. All default to `null` when not set, indicating that the corresponding metrics are not monitored.
-
-**Note**: when monitoring of the `optimizer_state` and `weight L2 norm` metrics is enabled, the time consumption of the training process increases greatly, so please choose carefully according to your needs. The "rank_x" directories under the `monitor_config.dump_path` path will be cleared, so make sure that no file that needs to be kept is under the set path.
-
-| tensorboard field parameter name | Descriptions | Types |
-|-----------------------------------------|---------------------|---------------|
-| tensorboard.tensorboard_dir | Sets the path where TensorBoard event files are saved. | str |
-| tensorboard.tensorboard_queue_size | Sets the maximum cache size of the capture queue. If it is exceeded, data is written to the event file. The default value is 10. | int |
-| tensorboard.log_loss_scale_to_tensorboard | Sets whether loss scale information is logged to the event file. Default is `False`. | bool |
-| tensorboard.log_timers_to_tensorboard | Sets whether to log timer information to the event file. The timer information contains the duration of the current training step (or iteration) as well as the throughput. Defaults to `False`. | bool |
-
-`tensorboard.tensorboard_dir` can also be specified via the environment variable 'MA_SUMMARY_LOG_DIR'; in that case, a default `tensorboard` configuration is automatically generated if `tensorboard` is not configured.
-It should be noted that without the `tensorboard` configuration, the "tensorboard" set in xxx_format by `monitor_config` will be replaced with "log", i.e., instead of writing to the tensorboard event file, the corresponding information will be printed in the log. - -## Viewing Training Data - -After the above configuration, the event file for each card will be saved under the path `./worker/tensorboard/rank_{id}`, where `{id}` is the rank number of each card. The event files are named `events.*`. The file contains `scalars` and `text` data, where `scalars` are the scalars of key metrics in the training process, such as learning rate, loss, etc.; `text` is the text data of all configurations for the training task, such as parallel configuration, dataset configuration, etc. In addition, according to the specific configuration, some metrics will be displayed in the log. - -Use the following command to start the Tensorboard Web Visualization Service: - -```bash -tensorboard --logdir=./worker/tensorboard/ --host=0.0.0.0 --port=6006 -``` - -|Parameter names | Descriptions | -|--------|--------------------------------------------------------| -| logdir | Path to the folder where TensorBoard saves event files | -| host | The default is 127.0.0.1, which means that only local access is allowed; setting it to 0.0.0.0 allows external devices to access it, so please pay attention to information security. | -| port | Set the port on which the service listens, the default is 6006. | - -The following is displayed when the command in the sample is entered: - -```shell -TensorBoard 2.18.0 at http://0.0.0.0:6006/ (Press CTRL+C to quit) -``` - -`2.18.0` indicates the version number of the current TensorBoard installation (the recommended version is `2.18.0`), and `0.0.0.0` and `6006` correspond to the input `--host` and `--port` respectively, after which you can visit `server public ip:port` in the local PC's browser to view the visualization page. For example, if the public IP of the server is `192.168.1.1`, then access `192.168.1.1:6006`. - -### Explanation of the Visualization of Indicators - -The callback functions `MFLossMonitor` and `TrainingStateMonitor` will monitor different scalar metrics respectively. The `TrainingStateMonitor` does not need to be set by the user in the configuration file, it will be added automatically according to monitor_config. - -#### MFLossMonitor Monitoring Metrics - -The names and descriptions of the metrics monitored by `MFLossMonitor` are listed below: - -| Scalar name | Descriptions | -|---------------|-----------------------------------------------------| -| learning-rate | learning rate | -| batch-size | batch size | -| loss | loss | -| loss-scale | Loss scaling factor, logging requires setting `log_loss_scale_to_tensorboard` to `True` | -| grad-norm | gradient exponent | -| iteration-time | The time taken for training iterations, logging requires setting `log_timers_to_tensorboard` to `True` | -| throughput | Data throughput, logging requires setting `log_timers_to_tensorboard` to `True` | -| model-flops-throughput-per-npu | Model operator throughput in TFLOPS/npu (trillion floating point operations per second per card) | -| B-samples-per-day | Cluster data throughput in B samples/day (one billion samples per day), logging requires setting `log_timers_to_tensorboard` to `True` | - -In Tensorboard SCALARS page, the above metrics (assumed to be named `scalar_name`) have drop-down tabs for `scalar_name` and `scalar_name-vs-samples`, except for the last two. 
A line plot of this scalar versus the number of training iterations is shown under `scalar_name`, and a line plot of this scalar versus the number of samples is shown under `scalar_name-vs-samples`. An example of a plot of the learning rate `learning-rate` is shown below:
-
-![/tensorboard_scalar](../../source_zh_cn/function/image/tensorboard_scalar.png)
-
-#### TrainingStateMonitor Monitoring Metrics
-
-The names and descriptions of the metrics monitored by `TrainingStateMonitor` are listed below:
-
-| Scalar name | Descriptions |
-|----------------------|-----------------------------------------------|
-| local_norm | Gradient norm of each parameter on a single card; to record it, `local_norm_format` must be set to non-null |
-| device_local_norm | Total gradient norm on a single card; to record it, `device_local_norm_format` must be set to non-null |
-| local_loss | Local loss on a single card; to record it, `local_loss_format` must be set to non-null |
-| adam_m_norm | Norm of the optimizer's first-order moment estimate for each parameter; to record it, `optimizer_state_format` must be set to non-null |
-| adam_v_norm | Norm of the optimizer's second-order moment estimate for each parameter; to record it, `optimizer_state_format` must be set to non-null |
-| weight_norm | Weight L2 norm; to record it, `weight_state_format` must be set to non-null |
-| throughput_linearity | Data throughput linearity; to record it, `throughput_baseline` must be set to non-null |
-
-Depending on the specific settings, the above metrics will be displayed in TensorBoard or the logs as follows:
-
-**Example of logging effect**
-
-![/TrainingStateMonitor_log](../../source_zh_cn/function/image/TrainingStateMonitor_log.png)
-
-**Example of tensorboard visualization**
-
-adam_m_norm
-
-![/adam_m_norm](../../source_zh_cn/function/image/adam_m_norm.png)
-
-local_loss and local_norm
-
-![/local_loss&local_norm](../../source_zh_cn/function/image/local_loss&local_norm.png)
-
-### Description of Text Data Visualization
-
-On the TEXT page, a tab exists for each training configuration, where the value of that configuration is recorded. This is shown in the following figure:
-
-![/tensorboard_text](../../source_zh_cn/function/image/tensorboard_text.png)
-
-All configuration names and descriptions are listed below:
-
-| Configuration names | Descriptions |
-|----------------------------|--------------------------------------------------------------|
-| seed | Random seed |
-| output_dir | Save path for checkpoints and strategies |
-| run_mode | Running mode |
-| use_parallel | Whether to enable parallelism |
-| resume_training | Whether to enable resumable training |
-| ignore_data_skip | Whether to ignore the mechanism for skipping data at breakpoints in resumable training and read the dataset from the beginning. Recorded only if the `resume_training` value is `True` |
-| data_skip_steps | The number of dataset steps to skip. Only logged if `ignore_data_skip` is logged and its value is `False` |
-| load_checkpoint | Model name or weight path for loading weights |
-| load_ckpt_format | File format of the loaded weights. Only logged if the `load_checkpoint` value is not null |
-| auto_trans_ckpt | Whether to enable automatic online weight slicing or conversion. Only logged if the `load_checkpoint` value is not null |
-| transform_process_num | The number of processes used to convert the checkpoint. Only logged if `auto_trans_ckpt` is logged and the value is `True`.
| src_strategy_path_or_dir | Path of the distributed strategy file of the source weights. Recorded only when `auto_trans_ckpt` is recorded and its value is `True` |
| load_ckpt_async | Whether to load weights asynchronously. Recorded only when the `load_checkpoint` value is not null |
| only_save_strategy | Whether the task only saves distributed strategy files |
| profile | Whether to enable the performance analysis tool |
| profile_communication | Whether to collect communication performance data in multi-device training. Recorded only when the `profile` value is `True` |
| profile_level | Level of the collected performance data. Recorded only when the `profile` value is `True` |
| profile_memory | Whether to collect Tensor memory data. Recorded only when the `profile` value is `True` |
| profile_start_step | Step at which performance profiling starts. Recorded only when the `profile` value is `True` |
| profile_stop_step | Step at which performance profiling stops. Recorded only when the `profile` value is `True` |
| profile_rank_ids | Rank IDs for which profiling is enabled. Recorded only when the `profile` value is `True` |
| profile_pipeline | Whether to enable profiling for the cards of each stage in pipeline parallelism. Recorded only when the `profile` value is `True` |
| init_start_profile | Whether to enable data collection during Profiler initialization. Recorded only when the `profile` value is `True` |
| layer_decay | Layer decay coefficient |
| layer_scale | Whether to enable layer scaling |
| lr_scale | Whether to enable learning rate scaling |
| lr_scale_factor | Learning rate scaling factor. Recorded only when the `lr_scale` value is `True` |
| micro_batch_interleave_num | Number of batch_size splits; switch for multi-copy parallelism |
| remote_save_url | Folder path of the target bucket to which results are returned when using AICC training jobs |
| callbacks | Callback function configuration |
| context | Environment configuration |
| data_size | Dataset size |
| device_num | Number of devices (cards) |
| do_eval | Whether to enable evaluation during training |
| eval_callbacks | Evaluation callback function configuration. Recorded only when the `do_eval` value is `True` |
| eval_step_interval | Evaluation step interval. Recorded only when the `do_eval` value is `True` |
| eval_epoch_interval | Evaluation epoch interval. Recorded only when the `do_eval` value is `True` |
| eval_dataset | Evaluation dataset configuration. Recorded only when the `do_eval` value is `True` |
| eval_dataset_task | Evaluation task configuration. Recorded only when the `do_eval` value is `True` |
| lr_schedule | Learning rate schedule |
| metric | Evaluation function |
| model | Model configuration |
| moe_config | Mixture-of-experts configuration |
| optimizer | Optimizer configuration |
| parallel_config | Parallel strategy configuration |
| parallel | Automatic parallel configuration |
| recompute_config | Recomputation configuration |
| remove_redundancy | Whether redundancy is removed when the checkpoint is saved |
| runner_config | Running configuration |
| runner_wrapper | Wrapper configuration |
| monitor_config | Training metrics monitoring configuration |
| tensorboard | TensorBoard configuration |
| train_dataset_task | Training task configuration |
| train_dataset | Training dataset configuration |
| trainer | Training process configuration |
| swap_config | Fine-grained activations SWAP configuration |

> The above training configurations are derived from:
>
> 1. Configuration parameters passed in by the user in the training startup command `run_mindformer.py`;
> 2.
Configuration parameters set by the user in the training configuration file `yaml`; -> 3. Default configuration parameters during training. -> -> Refer to [Configuration File Description](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html) for all configurable parameters. \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/function/resume_training.md b/docs/mindformers/docs/source_en/function/resume_training.md deleted file mode 100644 index 5a788a40c421b548e569b7bcc0d0fddd5a36454d..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/function/resume_training.md +++ /dev/null @@ -1,277 +0,0 @@ -# Weight Saving and Resumable Training - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/function/resume_training.md) - -## Weight Saving - -### Overview - -To train a deep learning model, saving the weights of the model is a critical step. The weight saving function enables you to store model parameters at any training stage so that you can resume training, evaluation, or deployment after the training is interrupted or completed. By saving the weights, you can also reproduce the experiment results in different environments. - -### Directory Structure - -During training, MindSpore Transformers generates two weight saving folders in the output directory: `checkpoint` and `checkpoint_network`. - -| Folder | Description | -|--------------------|-----------------------------------------------------| -| checkpoint | Stores the weights, optimizer status, steps, and epoches to the ckpt file for **resuming training**. | -| checkpoint_network | Stores only weight parameters in the ckpt file. This folder applies to **pre-trained weight** loading or **inference and evaluation** but not for resuming training.| - -#### `checkpoint` Directory Structure - -The weight file in the `checkpoint` folder is saved in the following format: - -```text -checkpoint - ├── rank_0 - ├── meta.json - └── {prefix}-{epoch}_{step}.ckpt - ... - └── rank_x - ├── meta.json - └── {prefix}-{epoch}_{step}.ckpt -``` - -| File | Description | -|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| meta.json | Record the `epoch`, `step`, and name of the last saved weight. Each rank process maintains an independent `meta.json` file. | -| {prefix}-{epoch}_{step}.ckpt | Saved weight file. `prefix` contains the rank_id information in the `{prefix}-{epoch}_{step}.ckpt` format. If a file with the same prefix already exists, the system automatically adds a suffix. When data offloading is enabled, the `epoch` location is calculated as follows: $\frac{CurrentTotalStepNumber}{SinkSize} = \frac{((CurrentEpoch-1)*StepsPerEpoch+CurrentStepInEpoch)}{SinkSize}$. `step` is fixed to `sink_size`.| - -#### Directory Structure of `checkpoint_network` - -```text -checkpoint - ├── rank_0 - └── {prefix}-{epoch}_{step}.ckpt - ... - └── rank_x - └── {prefix}-{epoch}_{step}.ckpt -``` - -| File | Description | -|------------------------------|-------------------------------------------------------------------------------------------------------| -| {prefix}-{epoch}_{step}.ckpt | Saved weight file. 
`prefix` contains the rank_id information in the `{prefix}-{epoch}_{step}.ckpt` format. If a file with the same prefix already exists, the system automatically adds a suffix. The naming rule when data offloading is enabled is the same as the preceding naming rule.| - -### Configuration and Usage - -#### YAML Parameters - -You can modify the configuration file to control weight saving. The main parameters are as follows. - -| Parameter | Description | -|-----------------------|-----------------------------------| -| save_checkpoint_steps | Number of steps taken each time a weight is saved. If this parameter is not set, no weight is saved. | -| keep_checkpoint_max | Maximum number of weight files that can be saved at the same time. If the number of weight files reaches the upper limit, the earliest weight file will be deleted when the latest weight file is saved.| - -You can modify the fields under `CheckpointMonitor` in the `yaml` configuration file to control the weight saving behavior. For example: - -```yaml -callbacks: - ... - - type: CheckpointMonitor - prefix: "llama2_7b" - save_checkpoint_steps: 500 - keep_checkpoint_max: 3 - ... -``` - -In the preceding example, the weights are saved every 500 steps. A maximum of three weights can be saved at the same time. - -## Resumable Training - -### Overview - -MindSpore Transformers supports **step-level resumable training**, which allows the checkpoints of a model to be saved during training. If the training is interrupted, you can load a saved checkpoint to resume the training. This feature is crucial for processing large-scale training tasks, and can effectively reduce time and resource waste caused by unexpected interruptions. In addition, to resume a training where the dataset remains unchanged but the `global batch size` is changed, for example, when the cluster is changed or the configuration is modified, this tool supports automatic scaling of the number of resumable training steps and skipped data steps in the same proportion. - -### Configuration and Usage - -#### YAML Parameters - -You can modify the configuration file to control resumable training. The main parameters are as follows. For details about other parameters, see the description of CheckpointMonitor. - -| Parameter | Description | -|------------------|---------------------------------------------------------------------| -| load_checkpoint | Weight path loaded during resumable training. The path can be a folder path (used to load distributed weights) or a specific weight file path. The default value is an empty string, indicating that no weight is loaded (required for resumable training). | -| resume_training | Specifies whether to enable resumable training. You can set it to `True` or specify a weight file name. If the value is `True`, the system automatically resumes the training from the last interruption. The default value is `False`. | -| load_ckpt_async | Determines whether to load model weights and compile in parallel (this configuration does not take effect when auto_trans_ckpt is set to true). The default value is False (serial execution).
When set to `True`, weights are loaded in parallel with model compilation to reduce the overall resume-training time. |

Based on the input parameters, there are four cases.

| load_checkpoint | resume_training | Description | Recommended or Not |
|-----------------|-----------------|-------------|--------------------|
| Weight file path | True | Resumes a training based on the weights specified by load_checkpoint. | √ |
| Weight file path | Weight file name | The file name specified by resume_training is invalid. A training is resumed based on the weights specified by load_checkpoint. | × |
| Weight folder path | True | **Scenario 1: Single-node system, multi-node system + shared directory, or ModelArts**
    1. Resumes the training based on the weights recorded in meta.json files and supports fault recovery.
    2. Resumes the training based on the latest weight of all ranks if the meta.json file of any rank is missing.
    **Scenario 2: Multi-node+non-shared directory**
    Resumes the training based on the latest weight of all ranks.| √ | -| Weight folder path | Weight file name | Resumes the training based on the weights specified by resume_training. | √ | - -In addition, you can modify the following parameters in the configuration file to use related functions. - -| Parameter | Description | -|------------------|-------------------------------------------------------------------------------------------------------------| -| ignore_data_skip | Specifies whether to ignore the mechanism of skipping data during resumable training and read the dataset from the beginning instead. This parameter is used when the dataset is changed during resumable training. If this parameter is set to `True`, no data is skipped. The default value is `False`. | -| data_skip_steps | Number of steps skipped for the dataset. This parameter is used when the training is interrupted again after being resumed because the dataset or `global batch size` is changed. You need to manually set this parameter to configure the number of steps skipped for the new dataset. If the `global batch size` is changed, you need to divide and round down its value by the scaling coefficient and then specify the result as the value of this parameter.| - -#### Fault Recovery Mechanism - -If `resume_training` is set to `True`, the system automatically resumes training based on the weights recorded in `meta.json`. If the weight file of a rank is missing or damaged, the system rolls back to the latest available weight for recovery. - -> In a distributed environment, resumable training requires that the weights of all nodes be in the same shared directory. You can use the `SHARED_PATHS` environment variable to set the shared path. - -### Example of Distributed Training - -The following example shows how to enable resumable training in single-device and multi-device environments. The example is based on the `llama2_7b` model. -For related configuration files, see [configs/llama2/pretrain_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/pretrain_llama2_7b.yaml). - -#### Complete Training - -1. Modify `configs/llama2/pretrain_llama2_7b.yaml`. - - Configure the parallelism as required. - - ```yaml - parallel_config: - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 2 - micro_batch_num: 2 - ``` - - Configure the model weight saving as required. - - ```yaml - callbacks: - ... - - type: CheckpointMonitor - prefix: "llama2_7b" - save_checkpoint_steps: 10 - keep_checkpoint_max: 3 - integrated_save: False - async_save: False - ... - ``` - -2. Prepare a dataset. The following uses [wikitext2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#%E6%95%B0%E6%8D%AE%E5%8F%8A%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87) as an example to describe how to start four-device distributed training. - - ```shell - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_7b.yaml \ - --train_dataset /path/to/wikitext2-llama2.mindrecord \ - --run_mode train \ - --use_parallel True" 4 - ``` - - After the fourth saving is complete, end the process. The structure of the `rank_0` folder under `checkpoint` is as follows: - - ```text - checkpoint/rank_0 - ├── llama2_7b_rank_0-10_2.ckpt - ├── llama2_7b_rank_0-15_2.ckpt - ├── llama2_7b_rank_0-20_2.ckpt - └── meta.json - ``` - -#### Resumable Training - -1. Modify the configuration and specify the resumable training weight file. 
- - ```yaml - load_checkpoint: './output/checkpoint' - resume_training: True - ``` - -2. Resume training. - - ```shell - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_7b.yaml \ - --train_dataset /path/to/wikitext2-llama2.mindrecord \ - --run_mode train \ - --use_parallel True" 4 - ``` - - If the initial number of steps is `42`, the training is resumed successfully. The saved weight file contains the information about step `40`. The default value of `sink_size` is `2`, indicating that the information is printed every two steps. Therefore, the initial number of steps is `42`. - -#### Resumable Training with the Dataset Changed - -There are three main scenarios where the dataset is changed in resumable training. You need to modify the configuration file in each scenario. The following describes each case one by one, and describes in detail which step of the basic resumable training process needs to be modified, and how to modify a specific configuration to achieve an expected effect. - -**Scenario 1: Training resumed with a new dataset (but not skipping trained steps)** - -In this scenario, when the new dataset is used, the model training starts from scratch without skipping any data or steps. In this case, you need to set the configuration file **to ignore the previous data progress** so that the model can be trained from scratch based on the new dataset. - -- **Configuration modification**: You need to set `ignore_data_skip` based on the first step of the basic resumable training process. Set `ignore_data_skip` to `True`, indicating that no data is skipped. - - ```yaml - load_checkpoint: './output/checkpoint' - resume_training: True - ignore_data_skip: True - ``` - -- **Expected result**: The model is trained from scratch based on the new dataset without skipping any steps. - -**Scenario 2: Training resumed with a new dataset, skipping trained steps** - -In this case, the model has been partially trained based on the new dataset (for example, `2` steps have been performed before the training is interrupted), and the training is expected to continue from the last interruption. In this case, you must manually specify the number of steps to be skipped. - -- **Configuration modification**: You need to set `ignore_data_skip` and `data_skip_steps` based on the first step of the basic resumable training process. Set `ignore_data_skip` to `False` and use `data_skip_steps` to specify the number of trained steps to skip (for example, `2`). - - ```yaml - load_checkpoint: './output/checkpoint' - resume_training: True - ignore_data_skip: False - data_skip_steps: 2 - ``` - -- **Expected result**: The model skips the first `2` steps and continues the training from step `3` based on the new dataset. - -**Scenario 3: Training resumed with a new dataset and `global batch size` changed** - -If `global batch size` is changed (for example, doubled) when a training is resumed based on a new dataset, you need to scale the number of steps that have been performed when manually specifying the number of steps to be skipped. Specifically, the number of skipped steps needs to be divided and rounded down based on the scaling coefficient. For example, if the value of `global batch size` is changed to `2` times of the original value, the number of steps that need to be skipped is halved. - -- **Configuration modification**: Adjust `data_skip_steps` based on Scenario 2. Set `data_skip_steps` to the number of steps after scaling. 
For example, if `global batch size` is changed to `2` times of the original value, the number of steps to be skipped is changed to `1` (rounded down). - - ```yaml - load_checkpoint: './output/checkpoint' - resume_training: True - ignore_data_skip: False - data_skip_steps: 1 - ``` - -- **Expected result**: The model adjusts the number of skipped steps based on the new setting of `global batch size` and continues the training from the specified position. - -#### Fault Recovery Example - -If some weight files are missing, the system automatically restores the files based on the latest available weight. - -1. Delete the `llama2_7b_rank_0-20_2.ckpt` file from the `rank_3` directory. The folder structure after the deletion is as follows: - - ```text - checkpoint/rank_3 - ├── llama2_7b_rank_0-10_2.ckpt - ├── llama2_7b_rank_0-15_2.ckpt - └── meta.json - ``` - -2. Modify the configuration to enable fault recovery. - - ```yaml - load_checkpoint: './output/checkpoint' - resume_training: True - ``` - -3. Start distributed training. - - ```shell - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_7b.yaml \ - --train_dataset /path/to/wikitext2-llama2.mindrecord \ - --run_mode train \ - --use_parallel True" 4 - ``` - - If the initial number of steps is `32`, the training is resumed successfully. Because the weight of the information in step `40` under `rank_3` is deleted, the weight saved last time, that is, the weight of the information in step `30`, is automatically used. The default value of `sink_size` is `2`, indicating that information is printed every two steps. Therefore, the initial number of steps is `32`. - -### Precautions - -- **Data offloading**: You must enable data offloading and configure `sink_mode=True` for distributed resumable training. -- **Weight file check**: Ensure that the weights loaded for resumable training are the ones saved when the training is interrupted instead of in the entire training process. Otherwise, an error is reported. diff --git a/docs/mindformers/docs/source_en/function/safetensors.md b/docs/mindformers/docs/source_en/function/safetensors.md deleted file mode 100644 index bf5819585e6bc617ddada91f916b2c5eccfbe027..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/function/safetensors.md +++ /dev/null @@ -1,242 +0,0 @@ -# Safetensors Weights - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/function/safetensors.md) - -## Overview - -Safetensors is a reliable and portable machine learning model storage format from Huggingface for storing Tensors securely and with fast storage (zero copies). This article focuses on how MindSpore Transformers supports saving and loading of this file format to help users use weights better and faster. - -## Safetensors Weights Samples - -There are two main types of Safetensors files: complete weights files and distributed weights files. Below are examples of how they are obtained and the corresponding files. - -### Complete Weights - -Safetensors complete weights can be obtained in two ways: - -1. Download directly from Huggingface. -2. After MindSpore Transformers distributed training, the weights are generated by [merge script](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/transform_weight.html#safetensors-weight-merging). 
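As a minimal sketch of the first approach (assuming the `huggingface-cli` tool from the `huggingface_hub` package is installed and network access to Hugging Face is available; the repository name `Qwen/Qwen2-7B` and the local directory are illustrative and should be adapted to your model):

```shell
# Hypothetical example: download the complete safetensors weights from Hugging Face
# into a local folder that can later be passed to load_checkpoint.
pip install -U huggingface_hub
huggingface-cli download Qwen/Qwen2-7B --local-dir ./qwen2_7b/hf_unified_safetenosrs
```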
- -Huggingface Safetensors example catalog structure is as follows: - -```text -qwen2_7b - └── hf_unified_safetenosrs - ├── model-00001-of-00004.safetensors - ├── model-00002-of-00004.safetensors - ├── model-00003-of-00004.safetensors - ├── model-00004-of-00004.safetensors - └── model.safetensors.index.json # Huggingface weight parameter and file storage relationship mapping json file -``` - -MindSpore Safetensors example catalog structure is as follows: - -```text -qwen2_7b - └── ms_unified_safetenosrs - ├── model-00001-of-00004.safetensors - ├── model-00002-of-00004.safetensors - ├── model-00003-of-00004.safetensors - ├── model-00004-of-00004.safetensors - ├── hyper_param.safetensors # Hyperparameter files for training task records - └── param_name_map.json # MindSpore weight parameter and file storage relationship mapping json file -``` - -### Distributed Weights - -Safetensors distributed weights can be obtained in two ways: - -1. Generated by distributed training with MindSpore Transformers. -2. Using [format conversion script](https://www.mindspore.cn/docs/en/r2.6.0/api_python/mindspore/mindspore.ckpt_to_safetensors.html), the original distributed ckpt weights are changed to the Safetensors format. - -Distributed Safetensors example catalog structure is as follows: - -```text -qwen2_7b - └── distributed_safetenosrs - ├── rank_0 - └── qwen2_7b_rank_0.safetensors - ├── rank_1 - └── qwen2_7b_rank_1.safetensors - ... - └── rank_x - └── qwen2_7b_rank_x.safetensors -``` - -## Configuration Descriptions - -Load the relevant configurations: - -| Parameter names | Descriptions | -| ------------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| load_checkpoint | The path to the folder where the weights are preloaded.
- For complete weights, fill in the path to the folder containing the single or sharded weight files.
Note: Loading Hugging Face safetensors weights is supported (currently only for Llama series models). During online loading, a copy of the converted MindSpore safetensors weights is saved to `/output/ms_safetensors`.
- For distributed weights, they must be stored in the `model_dir/rank_x/xxx.safetensors` format, and the folder path filled in is `model_dir`. |
To load weights in `safetensors` format, set this option to `safetensors`. |
- If the loaded weights are complete weights:
a. When `use_parallel: True`, the load is treated as distributed loading; set `auto_trans_ckpt: True` as well to enable online slicing.
b. When `use_parallel: False`, the load is treated as single-card loading; set `auto_trans_ckpt: False` as well to disable online slicing.
- If the loaded weights are distributed weights:
    a. Without changing the original slicing strategy, you need to set `auto_trans_ckpt: False` to load directly according to the original slicing strategy.
    b. To change the original slicing strategy, set `auto_trans_ckpt: True` and configure `src_strategy_path_or_dir` to be the original slicing strategy file path.
    When the task is pulled up, the weights are merged online into full weights, which are sliced and loaded according to the parallelism strategy set in the configuration file. The online merged weights are saved in the current directory under the `/output/unified_checkpoint` file. | -| remove_redundancy | Whether the loaded weights are de-redundant, defaults to `False`. | - -Save the relevant configurations: - -| Parameter names | Descriptions | -| :-------------------------- | ------------------------------------------------------------ | -| callbacks.checkpoint_format | The format of the saved model weights, defaults to `ckpt`. Options are `ckpt` and `safetensors`. | -| callbacks.remove_redundancy | Whether to enable de-redundancy saving when saving weights, defaults to `False`. Only `safetensors format` is supported. | - -## Usage Example - -### Examples of Pre-training Tasks - -Taking Llama2-7B as an example, modify the configuration item [pretrain_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/pretrain_llama2_7b.yaml) to confirm the weight saving format: - -```yaml -callbacks: - - type: CheckpointMonitor - checkpoint_format: safetensors # Save weights file format - remove_redundancy: True # Turn on de-redundancy when saving weights -``` - -Execute the command when completed: - -```shell -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_7b.yaml \ - --train_dataset_dir /{path}/wiki4096.mindrecord \ - --use_parallel True \ - --run_mode train" 8 -``` - -After the task is executed, a checkpoint folder is generated in the mindformers/output directory, while the model files are saved in that folder. - -For more details, please refer to: [Introduction to Pre-training](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/usage/pre_training.html). 
- -### Examples of Fine-tuning Tasks - -If you use the full weighted multicard online fine-tuning, take the Qwen2-7B model as an example and modify the configuration item [finetune_qwen2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2/qwen2_7b/finetune_qwen2_7b.yaml): - -```yaml -# Modified configuration -load_checkpoint: '/qwen2_7b/hf_unified_safetenosrs' # Load weights file path -load_ckpt_format: 'safetensors' # Load weights file format -auto_trans_ckpt: True # This configuration item needs to be turned on for complete weights to enable the online slicing feature -parallel_config: # Configure the target distributed strategy - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 1 -callbacks: - - type: CheckpointMonitor - checkpoint_format: safetensors # Save weights file format -``` - -If you use distributed weights multicard online fine-tuning, take the Qwen2-7B model as an example, modify the configuration item [finetune_qwen2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2/qwen2_7b/finetune_qwen2_7b.yaml): - -```yaml -# Modified configuration -load_checkpoint: '/qwen2_7b/distributed_safetenosrs' # Load weights file path -load_ckpt_format: 'safetensors' # Load weights file format -parallel_config: # Configure the target distributed strategy - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 1 -callbacks: - - type: CheckpointMonitor - checkpoint_format: safetensors # Save weights file format -``` - -Execute the command when completed: - -```shell -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config research/qwen2/qwen2_7b/finetune_qwen2_7b.yaml \ - --train_dataset_dir /{path}/alpaca-data.mindrecord \ - --register_path research/qwen2 \ - --use_parallel True \ - --run_mode finetune" 2 -``` - -After the task is executed, a checkpoint folder is generated in the mindformers/output directory, while the model files are saved in that folder. 
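For illustration only (a sketch based on the directory conventions described earlier; the actual file names depend on the configured `prefix` and the step at which saving occurs), the saved safetensors output of this 2-card run might be organized as follows:

```text
output/checkpoint
  ├── rank_0
    └── {prefix}-{epoch}_{step}.safetensors
  └── rank_1
    └── {prefix}-{epoch}_{step}.safetensors
```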
- -For more details, please refer to [Introduction to SFT fine-tuning](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/usage/sft_tuning.html) - -### Example of an Inference Task - -If you use complete weighted multicard online inference, take the Qwen2-7B model as an example, and modify the configuration item [predict_qwen2_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2/qwen2_7b/predict_qwen2_7b_instruct.yaml): - -```yaml -# Modified configuration -load_checkpoint: '/qwen2_7b/hf_unified_safetenosrs' # Load weights file path -load_ckpt_format: 'safetensors' # Load weights file format -auto_trans_ckpt: True # This configuration item needs to be turned on for complete weights to enable the online slicing function -parallel_config: - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 1 -``` - -If you use distributed weighted multicard online inference, take the Qwen2-7B model as an example, modify the configuration item [predict_qwen2_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2/qwen2_7b/predict_qwen2_7b_instruct.yaml): - -```yaml -# Modified configuration -load_checkpoint: '/qwen2_7b/distributed_safetenosrs' # Load weights file path -load_ckpt_format: 'safetensors' # Load weights file format -parallel_config: - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 1 -``` - -Execute the command when completed: - -```shell -bash scripts/msrun_launcher.sh "python run_mindformer.py \ ---config research/qwen2/qwen2_7b/predict_qwen2_7b_instruct.yaml \ ---run_mode predict \ ---use_parallel True \ ---register_path research/qwen2 \ ---predict_data 'I love Beijing, because'" \ -2 -``` - -The results of executing the above single-card inference and multi-card inference commands are as follows: - -```text -'text_generation_text': [I love Beijing, because it is a city with a long history and culture.......] -``` - -For more details, please refer to: [Introduction to Inference](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/usage/inference.html) - -### Examples of Resumable Training after Breakpoint Tasks - -MindSpore Transformers supports step-level resumable training after breakpoint, which allows you to save a model's checkpoints during training and load the saved checkpoints to restore the previous state to continue training after a break in training. 
- -If you use distributed weight multicard resumable training and do not change the slicing strategy, modify the configuration item and start the original training task: - -```yaml -# Modified configuration -load_checkpoint: '/output/checkpoint' # Load source distributed weights file path -load_ckpt_format: 'safetensors' # Load weights file format -resume_training: True # Resumable training after breakpoint switch -callbacks: - - type: CheckpointMonitor - checkpoint_format: safetensors # Save weights file format -``` - -If the distributed weight multi-card training is renewed and the slicing strategy is changed, it is necessary to pass in the path of the source slicing strategy file and start the original training task after modifying the configuration items: - -```yaml -# Modified configuration -load_checkpoint: '/output/checkpoint' # Load source distributed weights file path -src_strategy_path_or_dir: '/output/src_strategy' # Load source strategy file for merging source distributed weights into full weights -load_ckpt_format: 'safetensors' # Load weights file format -auto_trans_ckpt: True # Enable online slicing -resume_training: True # Resumable training after breakpoint switch -parallel_config: # Configure the target distributed strategy - data_parallel: 2 - model_parallel: 4 - pipeline_stage: 1 -callbacks: - - type: CheckpointMonitor - checkpoint_format: safetensors # Save weights file format -``` - -In large cluster scale scenarios, to avoid the online merging process taking too long to occupy the training resources, it is recommended to [merge the complete weights](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/transform_weight.html#safetensors-weight-merging) with the original distributed weights file offline, and then pass it in. There is no need to pass in the path of the source slicing strategy file. - -For more details, please refer to: [Resumable Training](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/resume_training.html). diff --git a/docs/mindformers/docs/source_en/function/transform_weight.md b/docs/mindformers/docs/source_en/function/transform_weight.md deleted file mode 100644 index 086ee559dd8c8e477e13c6040aa94aad0aaa426c..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/function/transform_weight.md +++ /dev/null @@ -1,421 +0,0 @@ -# Distributed Weight Slicing and Merging - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/function/transform_weight.md) - -## Overview - -In a current distributed training and inference environment, if a pre-trained weight does not match a distributed strategy, the pre-trained weight needs to be converted to adapt to the corresponding distributed strategy. MindSpore Transformers provides a set of weight conversion tools to meet the requirements in different scenarios. This tool can be used to slice a single-device weight into multi-device weights, convert between multi-device weights, and merge multi-device weights into a single-device weight. You can select [Automatic Conversion](#automatic-conversion) or [Offline Conversion](#offline-conversion) as required so that a model can quickly switch between different distributed scenarios. - -In addition, MindSpore Transformers supports [LoRA Weight Merging](#lora-weight-merging) to facilitate the deployment of models fine-tuned using LoRA. 
- -## Automatic Conversion - -When a model loads a weight, it automatically checks whether the weight is matching the distributed slicing strategy of the current model. If they do not match, the weight is automatically converted. - -### Parameters - -Parameters in the `yaml` file related to **automatic weight conversion** are described as follows: - -| Parameter | Description | -| ------------------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| load_checkpoint | Absolute path or folder path of the pre-loaded weights.
    - For a complete set of weights, set this parameter to an absolute path.
    - For a distributed weight, set this parameter to the folder path. The distributed weight must be stored in the `model_dir/rank_x/xxx.ckpt` format. The folder path is `model_dir`.
    **If there are multiple CKPT files in the rank_x folder, the last CKPT file in the file name sequence is used for conversion by default.** | -| src_strategy_path_or_dir | Path of [the distributed strategy file](#generating-distributed-strategy) corresponding to the pre-loaded weights.
    - If the pre-loaded weights are a complete set of weights, leave this parameter **blank**.
    - If the pre-loaded weights are distributed and pipeline parallelism is used when the pre-loaded weights are saved, set this parameter to the **merged strategy file path** or **distributed strategy folder path**.
- If the pre-loaded weights are distributed and pipeline parallelism is not used when the pre-loaded weights are saved, set this parameter to any **ckpt_strategy_rank_x.ckpt** path. |
    - If transform_process_num is set to 1, only rank_0 is used for weight conversion. Other processes wait until the conversion ends.
    - If transform_process_num is larger than 1, **multiple processes conduct conversion**. For example, for an 8-device task, if transform_process_num is set to 2, rank_0 is used for converting the weights of slices rank_0, rank_1, rank_2, and rank_3, and rank_4 is used for converting the weights of slices rank_4, rank_5, rank_6, and rank_7, and other processes wait until rank_0 and rank_4 complete the conversion.
    **Note**:
    1. A larger value of transform_process_num indicates a shorter conversion time and **a larger host memory occupied by the conversion**. If the host memory is insufficient, decrease the value of transform_process_num.
2. The value of transform_process_num must exactly divide the number of NPUs and cannot exceed it. |
    - If transform_process_num is larger than 1, the value is automatically set to `True`.
- If transform_process_num is set to 1 and the target weight is a distributed weight, the mindspore.transform_checkpoint_by_rank API is called cyclically to convert the weight of each rank slice in serial mode.
    - If transform_process_num is set to 1, if the target weight is a complete weight, the value is automatically set to `False`, and the mindspore.transform_checkpoints API is called for weight conversion. | - -### YAML Configurations in Different Scenarios - -#### Slicing a Single-Device Weight into Multi-Device Weights - -```yaml -# load_checkpoint: specifies path of the pre-trained weight file. -load_checkpoint: "/worker/llama3_8b/llama3_8b.ckpt" - -# auto_trans_ckpt: specifies whether to enable automatic conversion. -auto_trans_ckpt: True -``` - -#### Conversion Between Multi-Device Weights - -```yaml -# load_checkpoint: specifies the path of the multi-device weight folder. -load_checkpoint: "/worker/checkpoint/llama3-8b-2layer-dp2mp2pp2" - -# src_strategy_path_or_dir: specifies the path of the distributed strategy file. -src_strategy_path_or_dir: "/worker/checkpoint/llama3-8b-2layer-dp2mp2pp2/strategy/merged_ckpt_strategy.ckpt" - -# auto_trans_ckpt: specifies whether to enable automatic conversion. -auto_trans_ckpt: True -``` - -#### Merging Multi-Device Weights into a Single-Device Weight - -```yaml -# load_checkpoint: specifies the path of the multi-device weight folder. -load_checkpoint: "/worker/checkpoint/llama3-8b-2layer-dp1mp2pp2" - -# src_strategy_path_or_dir: specifies the path of the distributed strategy file. -src_strategy_path_or_dir: "/worker/checkpoint/llama3-8b-2layer-dp1mp2pp2/strategy/merged_ckpt_strategy.ckpt" - -# auto_trans_ckpt: specifies whether to enable automatic conversion. -auto_trans_ckpt: True - -# use_parallel: Set it to False. -use_parallel: False -``` - -#### Enabling Multi-Process Conversion (Optional) - -```yaml -# transform_process_num: specifies the number of processes involved in the conversion. -transform_process_num: 2 -``` - -### Precautions - -- **Multi-process conversion**: Set the `transform_process_num` parameter to enable multi-process conversion. Pay attention to the memory usage. If a memory overflow occurs, you are advised to reduce the number of processes. - -- **Automatic weight conversion**: After this function is enabled, the system deletes the old `strategy` and `transformed_checkpoint` folders from the `output` directory and saves the output of the current task. After the conversion task is complete, you are advised to move the `strategy` and `transformed_checkpoint` folders to a user-defined directory to prevent them from being deleted by mistake in subsequent operations. - -- **Distributed strategy file saving**: The distributed strategy file is saved in the `output/strategy` folder. If **pipeline parallelism** is enabled, the system automatically merges all `ckpt_strategy_rank_x.ckpt` files into a `merged_ckpt_strategy.ckpt` file. If pipeline parallelism is not enabled, the MERGE operation is not performed. - -## Offline Conversion - -The offline conversion function is designed to meet your requirements for manually converting weights. With offline conversion, you can convert model weights in an independent environment. Offline conversion supports multiple weight conversion scenarios, including slicing a single-device weight into multi-device weights, converting between multi-device weights, and merging multi-device weights into a single-device weight. - -When using offline conversion, you can manually configure conversion parameters as required to ensure that the conversion process is flexible and controllable. 
This function is especially suitable for model deployment and optimization in a strictly controlled computing environment. - -### Parameters - -Parameters in the `yaml` file related to **offline weight conversion** are described as follows: - -| Parameter | Description | -| ----------------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| src_checkpoint | Absolute path or folder path of the source weight.
    - For **a complete set of weights**, set this parameter to an **absolute path**.
    - For **distributed weights**, set this parameter to the **folder path**. The distributed weights must be stored in the `model_dir/rank_x/xxx.ckpt` format. The folder path is `model_dir`.
    **If there are multiple CKPT files in the rank_x folder, the last CKPT file in the file name sequence is used for conversion by default.** | -| src_strategy_path_or_dir | Path of the distributed strategy file corresponding to the source weight.
    - For a complete set of weights, leave it **blank**.
    - For distributed weights, if pipeline parallelism is used, set this parameter to the **merged strategy file path** or **distributed strategy folder path**.
    - For distributed weights, if pipeline parallelism is not used, set this parameter to any **ckpt_strategy_rank_x.ckpt** path. | -| dst_checkpoint | Path of the folder that stores the target weight. | -| dst_strategy | Path of the distributed strategy file corresponding to the target weight.
    - For a complete set of weights, leave it **blank**.
    - For distributed weights, if pipeline parallelism is used, set this parameter to the **merged strategy file path** or **distributed strategy folder path**.
    - For distributed weights, if pipeline parallelism is not used, set this parameter to any **ckpt_strategy_rank_x.ckpt** path.| -| prefix | Prefix name of the saved target weight. The weight is saved as {prefix}rank_x.ckpt. The default value is checkpoint_. | -| world_size | Total number of slices of the target weight. Generally, the value is dp \* mp \* pp. | -| process_num | Number of processes used for offline weight conversion. The default value is 1.
    - If process_num is set to 1, **a single process is used for conversion**.
    - If process_num is larger than 1, **multi-process conversion** is used. For example, if the target weight for conversion is the distributed weight of eight GPUs and process_num is set to 2, two processes are started to convert the weights of slices rank_0, rank_1, rank_2, and rank_3 and slices rank_4, rank_5, rank_6, and rank_7, respectively. | - -### Offline Conversion Configuration - -#### Generating Distributed Strategy - -MindSpore generates a distributed strategy file (ckpt format) corresponding to the number of cards in the `output/strategy` folder after running a distributed task, which can be used in offline weight conversion. - -If there is currently no distributed strategy file, it can be quickly generated by setting `only_save_strategy:True` in the yaml configuration file on the basis of the original distributed training/inference task. After setting, the task will stop immediately after generating the distributed strategy file, without actually executing training or inference. - -#### Single-Process Conversion - -Use [mindformers/tools/ckpt_transform/transform_checkpoint.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/ckpt_transform/transform_checkpoint.py) to perform single-process conversion on the loaded weight. - -**Run the command.** - -```shell -python transform_checkpoint.py \ - --src_checkpoint /worker/checkpoint/llama3-8b-2layer/rank_0/llama3_8b.ckpt \ - --dst_checkpoint /worker/transform_ckpt/llama3_8b_1to8/ \ - --dst_strategy /worker/mindformers/output/strategy/ -``` - -#### Multi-Process Conversion - -Use [mindformers/tools/ckpt_transform/transform_checkpoint.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/ckpt_transform/transform_checkpoint.sh) to perform multi-process conversion on the loaded weight. - -**Run the command.** - -```shell -bash transform_checkpoint.sh \ - /worker/checkpoint/llam3-8b-2layer/rank_0/llama3_8b.ckpt \ - None \ - /worker/transform_ckpt/llama3_8b_1to8/ \ - /worker/mindformers/output/strategy/ \ - 8 2 -``` - -**Precautions**: - -- When the [transform_checkpoint.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/ckpt_transform/transform_checkpoint.sh) script is used, `8` indicates the number of target devices, and `2` indicates that two processes are used for conversion. - -## Special Scenarios - -### Multi-Node Multi-Device Training on Physical Machines - -Training a large-scale model usually needs a cluster of servers. In the multi-node multi-device scenario, if there is a shared disk between servers, the automatic conversion function can be used. Otherwise, only offline conversion can be used. The following example is a training that uses two servers and 16 GPUs. - -#### Scenario 1: A shared disk exists between servers. - -If there is a shared disk between servers, you can use MindSpore Transformers to automatically convert a weight before multi-node multi-device training. Assume that `/data` is the shared disk between the servers and the MindSpore Transformers project code is stored in the `/data/mindformers` directory. - -- **Single-process conversion** - - In single-process conversion mode, you only need to set the path of the pre-trained weight in the configuration file and enable automatic weight conversion. - - **Configure the parameter.** - - ```yaml - # Set the path of the pre-trained weight file to an absolute path. 
- load_checkpoint: "/worker/checkpoint/llama3-8b/rank_0/llama3_8b.ckpt" - - # Set auto_trans_ckpt to True to enable automatic weight conversion. - auto_trans_ckpt: True - - # Set the dataset path. - train_dataset: &train_dataset - data_loader: - type: MindDataset - dataset_dir: "/worker/dataset/wiki103/" - shuffle: True - - # Configure the 16-device distributed strategy (for reference only). - parallel_config: - data_parallel: 2 - model_parallel: 4 - pipeline_stage: 2 - micro_batch_num: 2 - vocab_emb_dp: True - gradient_aggregation_group: 4 - micro_batch_interleave_num: 1 - ``` - -- **Multi-process conversion (optional)** - - To accelerate weight conversion, you can choose the multi-process conversion mode by setting the `transform_process_num` parameter. - - **Configure the parameter.** - - ```yaml - # Use two processes for conversion. - transform_process_num: 2 - ``` - - **Start a task.** - - Use [mindformers/scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/scripts/msrun_launcher.sh) to start the task. - - ```shell - # First server (main node) - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config {CONFIG_PATH} \ - --run_mode train" \ - 16 8 ${ip} ${port} 0 output/msrun_log False 300 - # Second server (subnode) - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config {CONFIG_PATH} \ - --run_mode train" \ - 16 8 ${ip} ${port} 1 output/msrun_log False 300 - ``` - -#### Scenario 2: No shared disk exists between servers. - -If there is no shared disk between servers, you need to use the offline weight conversion tool to convert the weight. The following steps describe how to perform offline weight conversion and start a multi-node multi-device training task. - -- **Obtain the distributed policy file.** - - Before offline weight conversion, you need to obtain the distributed strategy file of each node. - - **Configure the parameter.** - - ```yaml - # Set **only_save_strategy** to **True** to obtain the distributed strategy file. - only_save_strategy: True - - # Set the dataset path. - train_dataset: &train_dataset - data_loader: - type: MindDataset - dataset_dir: "/worker/dataset/wikitext_2048/" - shuffle: True - - # Configure the 16-device distributed strategy (for reference only). - parallel_config: - data_parallel: 2 - model_parallel: 4 - pipeline_stage: 2 - micro_batch_num: 2 - vocab_emb_dp: True - gradient_aggregation_group: 4 - micro_batch_interleave_num: 1 - ``` - - The strategy file of each node is stored in the corresponding `output/strategy` directory. For example, node 0 stores the `ckpt_strategy_rank_0-7.ckpt` file, and node 1 stores the `ckpt_strategy_rank_8-15.ckpt` file. Then, you need to integrate the strategy files of all nodes on the same server to facilitate subsequent operations. - -- **Offline weight conversion** - - On the server where all strategy files are stored, use [mindformers/tools/ckpt_transform/transform_checkpoint.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/ckpt_transform/transform_checkpoint.py) to perform offline weight conversion. - - **Single-process conversion** - - ```shell - python mindformers/tools/ckpt_transform/transform_checkpoint.py \ - --src_checkpoint /worker/checkpoint/llama3-8b/rank_0/llama_7b.ckpt \ - --dst_checkpoint ./output/llama3_8b_dp2mp4pp2 \ - --dst_strategy ./output/strategy - ``` - - **Multi-process conversion (optional)** - - ```shell - # Use two processes for conversion. 
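# Positional arguments, as a reading aid based on the parameter descriptions above:
# src_checkpoint, src_strategy (None here, presumably because the source is a complete weight),
# dst_checkpoint, dst_strategy, world_size (16 target devices), process_num (2 conversion processes).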
- bash mindformers/tools/ckpt_transform/transform_checkpoint.sh \ - /worker/checkpoint/llama3-8b/rank_0/llama_7b.ckpt \ - None \ - ./output/llama3_8b_dp2mp4pp2 \ - ./output/strategy \ - 16 2 - ``` - -- **Copy the weights to other nodes.** - - Copy the distributed weights that have been converted to respective nodes. Node 0 requires only the weights of slices from `rank_0` to `rank_7`, and node 1 requires only the weights of slices from `rank_8` to `rank_15`. - -- **Set the parameter.** - - ```yaml - # Set the pre-trained weight path to model_dir, the distributed weight folder path. - load_checkpoint: "/worker/checkpoint/llama3_8b_dp2mp4pp2" - - # Change only_save_strategy to False. - only_save_strategy: False - ``` - -### ModelArts Training - -Training in ModelArts is similar to multi-node multi-device training on physical machines. Automatic weight conversion can also be enabled. You can set `auto_trans_ckpt=True` in the hyperparameters of a training task to enable automatic weight conversion and set `transform_process_num > 1` to enable multi-process conversion. - -**Note**: If the number of NPUs on the server node in the ModelArts resource pool is not 8, you need to set `npu_num_per_node = the number of NPUs on the node`. For example, if each node is configured with 16 NPUs, `npu_num_per_node=16` should be set. - -## LoRA Weight Merging - -### Overview - -The basic principle of low-rank adaptation (LoRA) is to parameterize the original model with low-rank weights. The core process of merging LoRA weights is to calculate the parameters of the LoRA branches and add them to the corresponding model parameters, which makes the parameter list of the final weight file the same as that of the original model and excludes additional LoRA parameters. This operation does not affect the inference result. Therefore, the model after merging still has the same performance as the original model during inference. -For details about the principles and implementation of LoRA, see the following resources: - -- Paper: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) -- GitHub: [https://github.com/microsoft/LoRA](https://github.com/microsoft/LoRA) - -### Instructions - -Use the [LoRA weight merging script](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/transform_ckpt_lora.py) provided by MindSpore Transformers to merge LoRA weights as follows: - -```shell -python mindformers/tools/transform_ckpt_lora.py \ - --src_ckpt_strategy src_strategy_path_or_dir \ - --src_ckpt_path_or_dir src_ckpt_path_or_dir \ - --dst_ckpt_dir dst_ckpt_dir \ - --prefix "checkpoint_" \ - --lora_scaling lora_alpha/lora_rank -``` - -#### Parameters - -- **src_ckpt_strategy**: specifies the path of the distributed strategy file corresponding to the source weight. The file is stored in the `output/strategy/` directory by default after the training task is started. If the source is a complete set of weights, you do not need to set this parameter. If the source contains distributed weights, set this parameter based on the following conditions: - - **Pipeline parallelism enabled for the source weights**: Weight conversion is based on the merging strategy file. Set the parameter to the path of the distributed strategy folder. The script automatically merges all `ckpt_strategy_rank_x.ckpt` files in the folder into `merged_ckpt_strategy.ckpt` in the folder. If `merged_ckpt_strategy.ckpt` already exists, set the parameter to the path of the file. 
- - **Pipeline parallelism not enabled for the source weights**: Weight conversion can be based on any strategy file. Set the parameter to the path of any `ckpt_strategy_rank_x.ckpt` file. - - **Note**: If a `merged_ckpt_strategy.ckpt` already exists in the strategy folder and is still transferred to the folder path, the script deletes the old `merged_ckpt_strategy.ckpt` and then merges files into a new `merged_ckpt_strategy.ckpt` for weight conversion. Therefore, ensure that the folder has enough write permission. Otherwise, an error will be reported. -- **src_ckpt_path_or_dir**: specifies the path of the source weight. For distributed weights, set the parameter to the path of the folder where the source weights are located. The source weights must be stored in the `model_dir/rank_x/xxx.ckpt` format, and the folder path must be set to `model_dir`. If the source is a complete set of weights, set the parameter to an absolute path. -- **dst_ckpt_dir**: specifies the path for storing the target weight, which must be a user-defined path of an empty folder. The target weight is saved in the `model_dir/rank_x/xxx.ckpt` format. -- **prefix**: name prefix of the target weight file. The default value is "checkpoint_", indicating that the target weight is saved in the `model_dir/rank_x/checkpoint_x.ckpt` format. -- **lora_scaling**: combination coefficient of the LoRA weight. The default value is `lora_alpha/lora_rank`. The two parameters are used for LoRA model configuration and need to be calculated. - -### Examples - -#### Scenario 1: There is a complete set of weights for LoRA parameters. - -If the weight file before merging is a complete one, you can set the parameters as follows (directly enter the path of the complete set of weights): - -```shell -python mindformers/tools/transform_ckpt_lora.py \ - --src_ckpt_path_or_dir .../xxx/xxx.ckpt \ - --dst_ckpt_dir dst_ckpt_dir \ - --prefix "checkpoint_" \ - --lora_scaling lora_alpha/lora_rank -``` - -#### Scenario 2: There are distributed weights for LoRA parameters. - -If the weight file before merging contains distributed weights, you can set the parameters as follows (enter the path of the distributed weight folder and the path of the distributed strategy folder). The obtained weights are automatically merged into a complete weight file. - -```shell -python mindformers/tools/transform_ckpt_lora.py \ - --src_ckpt_strategy .../xxx/mindformers/output/strategy/ \ - --src_ckpt_path_or_dir .../xxx/model_dir \ - --dst_ckpt_dir dst_ckpt_dir \ - --prefix "checkpoint_" \ - --lora_scaling lora_alpha/lora_rank -``` - -## Safetensors Weight Merging - -### Instructions - -Use the [safetensors weight merging script](https://gitee.com/mindspore/mindformers/blob/r1.5.0/toolkit/safetensors/unified_safetensors.py) provided by MindSpore Transformers to perform safetensors weight merging. - -```shell -python toolkit/safetensors/unified_safetensors.py \ - --src_strategy_dirs src_strategy_path_or_dir \ - --mindspore_ckpt_dir mindspore_ckpt_dir\ - --output_dir output_dir \ - --file_suffix "1_1" \ - --has_redundancy has_redundancy -``` - -#### Parameters - -- **src_strategy_dirs**: specifies the path of the distributed strategy file corresponding to the source weight. The file is stored in the `output/strategy/` directory by default after the training task is started. Set the distributed weight based on the following conditions: - - **Pipeline parallelism enabled for the source weights**: Weight conversion is based on the merging strategy file. 
Set the parameter to the path of the distributed strategy folder. The script automatically merges all `ckpt_strategy_rank_x.ckpt` files in the folder into `merged_ckpt_strategy.ckpt` in the folder. If `merged_ckpt_strategy.ckpt` already exists, set the parameter to the path of that file.
-  - **Pipeline parallelism not enabled for the source weights**: Weight conversion can be based on any strategy file. Set the parameter to the path of any `ckpt_strategy_rank_x.ckpt` file.
-  - **Note**: If a `merged_ckpt_strategy.ckpt` already exists in the strategy folder and the folder path is still passed in, the script deletes the old `merged_ckpt_strategy.ckpt` and then merges the files into a new `merged_ckpt_strategy.ckpt` for weight conversion. Therefore, ensure that the folder has sufficient write permission; otherwise, an error will be reported.
-- **mindspore_ckpt_dir**: specifies the path of the distributed weights. Set it to the folder where the source weights are located. The source weights must be stored in the `model_dir/rank_x/xxx.safetensors` format, and the folder path must be set to `model_dir`.
-- **output_dir**: specifies the path for saving the target weights. The default value is "/new_llm_data/******/ckpt/nbg3_31b/tmp", that is, the target weights are saved to `/new_llm_data/******/ckpt/nbg3_31b/tmp` by default.
-- **file_suffix**: naming suffix of the target weight files. The default value is "1_1", which means the target weight files are searched for using the pattern `*1_1.safetensors`.
-- **has_redundancy**: whether the weights being merged are redundancy-removed weights. The default value is `True`.
-- **filter_out_param_prefix**: parameters to be filtered out during merging; the filtering rule is prefix name matching, for example, the optimizer parameter prefix "adam_".
-- **max_process_num**: maximum number of processes used for merging. Default value: 64.
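After a merge completes, it can be worth spot-checking the unified file before using it. The following is a minimal sketch, not part of the MindSpore Transformers tooling, that assumes the `safetensors` Python package is installed; the file path is illustrative and should point to a file produced under `output_dir` following the `file_suffix` naming described above.

```python
# Hypothetical sanity check of a merged safetensors file; the path below is
# illustrative and should point to a file generated under output_dir.
from safetensors import safe_open

merged_file = "./output/unified_safetensors/model-1_1.safetensors"  # example path

with safe_open(merged_file, framework="np") as f:
    names = list(f.keys())
    print(f"{len(names)} parameters found in the merged file")
    # If optimizer states were filtered with filter_out_param_prefix "adam_",
    # this list should be empty.
    print([n for n in names if n.startswith("adam_")])
```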
- -### Examples - -#### Scenario 1: Safetensors weights removed redundancy - -If merging the safetensors weights which have removed redundancy, you can set the parameters as follows: - -```shell -python toolkit/safetensors/unified_safetensors.py \ - --src_strategy_dirs src_strategy_path_or_dir \ - --mindspore_ckpt_dir mindspore_ckpt_dir\ - --output_dir output_dir \ - --file_suffix "1_1" \ - --has_redundancy True -``` - -#### Scenario 2: Safetensors weights did not remove redundancy - -If merging the safetensors weights which did not remove redundancy, you can set the parameters as follows: - -```shell -python toolkit/safetensors/unified_safetensors.py \ - --src_strategy_dirs src_strategy_path_or_dir \ - --mindspore_ckpt_dir mindspore_ckpt_dir\ - --output_dir output_dir \ - --file_suffix "1_1" \ - --has_redundancy False -``` - -#### Scenario 3: Safetensors weights of Adam optimizer are filtered - -If merge the filtered safetensors weights of Adam optimizer, you can fill in the parameters as follows: - -```shell -python toolkit/safetensors/unified_safetensors.py \ - --src_strategy_dirs src_strategy_path_or_dir \ - --mindspore_ckpt_dir mindspore_ckpt_dir\ - --output_dir output_dir \ - --file_suffix "1_1" \ - --filter_out_param_prefix "adam_" -``` \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/function/weight_conversion.md b/docs/mindformers/docs/source_en/function/weight_conversion.md deleted file mode 100644 index 70a20dfb54e5400df7148f6d393809856be45702..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/function/weight_conversion.md +++ /dev/null @@ -1,124 +0,0 @@ -# Weight Format Conversion - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/function/weight_conversion.md) - -## Overview - -MindSpore Transformers provides a unified weight conversion tool that allows model weights to convert between the HuggingFace and MindSpore Transformers formats. This helps you: - -- Convert a HuggingFace weight to a MindSpore Transformers one for fine-tuning, evaluation, or inference on MindSpore Transformers. -- Convert the weights trained or fine-tuned using MindSpore Transformers to HuggingFace weights and uses them on other frameworks. - -## Conversion Procedure - -To perform weight conversion, clone the complete HuggingFace repository of the model to be converted locally, and execute the `mindformers/convert_weight.py` script. This script automatically converts the HuggingFace model weight file into a weight file applicable to MindSpore Transformers. If you want to convert a MindSpore Transformers weight to a HuggingFace one, set `reversed` to `True`. - -```shell -python convert_weight.py [-h] --model MODEL [--reversed] --input_path INPUT_PATH --output_path OUTPUT_PATH [--dtype DTYPE] [--n_head N_HEAD] [--hidden_size HIDDEN_SIZE] [--layers LAYERS] [--is_pretrain IS_PRETRAIN] [--telechat_type TELECHAT_TYPE] -``` - -### Parameters - -- model: model name. -- reversed: converts a MindSpore Transformers weight to the HuggingFace one. -- input_path: path of the HuggingFace weight folder, which points to the downloaded weight file. -- output_path: path for storing the MindSpore Transformers weight file after conversion. -- dtype: weight data type after conversion. -- n_head: takes effect only for the BLOOM model. 
Set this parameter to `16` when `bloom_560m` is used and to `32` when `bloom_7.1b` is used. -- hidden_size: takes effect only for the BLOOM model. Set this parameter to `1024` when `bloom_560m` is used and to `4096` when `bloom_7.1b` is used. -- layers: number of layers to be converted. This parameter takes effect only for the GPT2 and WizardCoder models. -- is_pretrain: converts the pre-trained weight. This parameter takes effect only for the Swin model. -- telechat_type: version of the TeleChat model. This parameter takes effect only for the TeleChat model. - -## Conversion Example - -Assume that you have downloaded the [Llama2 model weight](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD) and saved it in the `/home/user/torch_weights` path. To convert it to the MindSpore Transformers weight and save it in the `/home/user/ms_weights` path, run the following command: - -```bash -python convert_weight.py --model llama --input_path /home/user/torch_weights --output_path /home/user/ms_weights/llama.ckpt -``` - -After the preceding steps are performed, the HuggingFace weight is successfully converted to a MindSpore Transformers weight, facilitating model training or inference on MindSpore Transformers. - -## Supported Models - -| Parameter Value | Supported models | -|-----------|---------------------------------------------| -| llama | Llama2, Llama3, Llama3.1, CodeLlama | -| baichuan2 | Baichuan2 | -| glm-n | GLM2, GLM3, GLM3-32K, GLM4 | -| cogvlm2 | CogVLM2-Video, CogVLM2-Image | -| qwen | Qwen, Qwen1.5, Qwen2 | -| qwenvl | QwenVL | -| internlm | InternLM | -| internlm2 | InternLM2 | -| yi | Yi | -| mixtral | Mixtral | -| deepseek | DeepSeekCoder, DeepSeekCoder1.5, DeepSeekV2 | -| gpt | GPT2 | -| whisper | Whisper | - -## Developing Weight Conversion for Unsupported Models - -1. Add the `convert_weight.py` and `convert_reversed.py` files to the extended model directory. -2. Compile the `convert_pt_to_ms` and `convert_ms_to_pt` weight conversion functions in the files. The function parameters are `input_path`, `output_path`, `dtype`, and an additional parameter `**kwargs`. -3. Add the extended model name and conversion function import paths to the `convert_map` and `reversed_convert_map` dictionaries in the `convert_weight.py` file in the MindSpore Transformers code root directory. -4. Call the `parser.add_argument()` method in the `main` function to add the additional parameter. - -## Example of Developing Model Weight Conversion - -Llama is used as an example. To convert a HuggingFace weight to a MindSpore Transformers one, define the `convert_pt_to_ms` function in [convert_weight.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/llama/convert_weight.py). - -```python -def convert_pt_to_ms(input_path, output_path, dtype=None, **kwargs): - """convert hf weight to ms.""" - print(f"Trying to convert huggingface checkpoint in '{input_path}'.", flush=True) - try: - from transformers import LlamaForCausalLM - except: - raise ImportError(f"Failed to load huggingface checkpoint. 
Please make sure transformers is available.") - - try: - model_hf = LlamaForCausalLM.from_pretrained(os.path.dirname(input_path)) - except Exception as e: - print(f"Do not find huggingface checkpoint in '{os.path.dirname(input_path)}', Error {e.message}.", flush=True) - return False - ckpt_list = [] - for name, value in model_hf.state_dict().items(): - name = name_replace(name) - if name == 'norm.weight': - name = 'norm_out.weight' - if name[:7] == 'layers.': - name = name[7:] - - print(f'\rprocessing parameter: {name} {value.shape} ', end='', flush=True) - ckpt_list.append({'name': name, 'data': pt2ms(value, dtype)}) - - ms.save_checkpoint(ckpt_list, output_path) - print(f"\rConvert huggingface checkpoint finished, the mindspore checkpoint is saved in '{output_path}'.", - flush=True) - return True -``` - -To convert a MindSpore Transformers weight to a HuggingFace one, define the `convert_ms_to_pt` function in [convert_reversed.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/llama/convert_reversed.py). - -```python -def convert_ms_to_pt(input_path, output_path, dtype=None, **kwargs): - """convert ms weight to hf.""" - print(f"Trying to convert mindspore checkpoint in '{input_path}'.", flush=True) - model_ms = ms.load_checkpoint(input_path) - - state_dict = {} - for name, value in model_ms.items(): - name = name_replace(name) - print(f'\rprocessing parameter: {name} {value.shape} ', end='', flush=True) - if is_lora_param(name): - name = name.replace('.tk_delta_lora_a', '.lora_A.weight') - name = name.replace('.tk_delta_lora_b', 'lora_B.weight') - state_dict[name] = ms2pt(value, dtype) - - torch.save(state_dict, output_path) - print(f"\rConvert mindspore checkpoint finished, the huggingface checkpoint is saved in '{output_path}'.", - flush=True) - return True -``` \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/index.rst b/docs/mindformers/docs/source_en/index.rst deleted file mode 100644 index 08d4897e46e5e5f70d1f82b5c79d152489057f84..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/index.rst +++ /dev/null @@ -1,184 +0,0 @@ -MindSpore Transformers Documentation -===================================== - -MindSpore Transformers (also known as MindFormers) is a MindSpore-native foundation model suite designed to provide full-flow development capabilities for foundation model training, fine-tuning, evaluating, inference and deploying, providing the industry mainstream Transformer class of pre-trained models and SOTA downstream task applications, and covering a rich range of parallel features, with the expectation of helping users to easily realize large model training and innovative research and development. - -Users can refer to `Overall Architecture `_ and `Model Library `_ to get a quick overview of the MindSpore Transformers system architecture, and the list of supported functional features and foundation models. Further, refer to the `Installation `_ and `Quick Start `_ to get started with MindSpore Transformers. - -If you have any suggestions for MindSpore Transformers, please contact us via `issue `_ and we will handle them promptly. - -MindSpore Transformers supports one-click start of single/multi-card training, fine-tuning, evaluation, and inference processes for any task, which makes the execution of deep learning tasks more efficient and user-friendly by simplifying the operation, providing flexibility, and automating the process. 
Users can learn from the following explanatory documents: - -- `Development Migration `_ -- `Pretraining `_ -- `SFT Tuning `_ -- `Evaluation `_ -- `Inference `_ -- `Quantization `_ -- `Service Deployment `_ -- `Multimodal Model Development `_ - -Code repository address: - -Flexible and Easy-to-Use Personalized Configuration with MindSpore Transformers -------------------------------------------------------------------------------------------- - -With its powerful feature set, MindSpore Transformers provides users with flexible and easy-to-use personalized configuration options. Specifically, it comes with the following key features: - -1. `Weight Format Conversion `_ - - Provides a unified weight conversion tool that converts model weights between the formats used by HuggingFace and MindSpore Transformers. - -2. `Distributed Weight Slicing and Merging `_ - - Weights in different distributed scenarios are flexibly sliced and merged. - -3. `Distributed Parallel `_ - - One-click configuration of multi-dimensional hybrid distributed parallel allows models to run efficiently in clusters up to 10,000 cards. - -4. `Dataset `_ - - Support multiple types and formats of datasets. - -5. `Weight Saving and Resumable Training After Breakpoint `_ - - Supports step-level resumable training after breakpoint, effectively reducing the waste of time and resources caused by unexpected interruptions during large-scale training. - -6. `Training Metrics Monitoring `_ - - Provides visualization services for the training phase of large models for monitoring and analyzing various indicators and information during the training process. - -7. `Training High Availability `_ - - Provide high-availability capabilities for the training phase of large models, including end-of-life CKPT preservation, UCE fault-tolerant recovery, and process-level rescheduling recovery. - -8. `Safetensors Weights `_ - - Support the function of saving and loading weight files in safetensors format. - -9. `Fine-Grained Activations SWAP _` - - Support fine-grained selection of specific activations to enable SWAP and reduce peak memory overhead during model training. - -Deep Optimizing with MindSpore Transformers ---------------------------------------------- - -- `Precision Optimizing `_ -- `Performance Optimizing `_ - -Appendix ------------------------------------- - -- `Environment Variables Descriptions `_ -- `Configuration File Descriptions `_ - -FAQ ------------------------------------- - -- `Model-Related `_ -- `Function-Related `_ -- `MindSpore Transformers Contribution Guide `_ -- `Modelers Contribution Guide `_ - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: Start - :hidden: - - start/overview - start/models - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: Quick Start - :hidden: - - quick_start/install - quick_start/source_code_start - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: Usage Tutorials - :hidden: - - usage/dev_migration - usage/multi_modal - usage/pre_training - usage/sft_tuning - usage/evaluation - usage/inference - usage/quantization - usage/mindie_deployment - usage/pretrain_gpt - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: Function Description - :hidden: - - function/weight_conversion - function/transform_weight - function/distributed_parallel - function/dataset - function/resume_training - function/monitor - function/high_availability - function/safetensors - function/fine_grained_activations_swap - -.. 
toctree:: - :glob: - :maxdepth: 1 - :caption: Precision Optimization - :hidden: - - acc_optimize/acc_optimize - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: Performance Optimization - :hidden: - - perf_optimize/perf_optimize - -.. toctree:: - :maxdepth: 1 - :caption: API - :hidden: - - mindformers - mindformers.core - mindformers.dataset - mindformers.generation - mindformers.models - mindformers.modules - mindformers.pet - mindformers.pipeline - mindformers.tools - mindformers.wrapper - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: Appendix - :hidden: - - appendix/env_variables - appendix/conf_files - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: FAQ - :hidden: - - faq/model_related - faq/func_related - faq/mindformers_contribution - faq/modelers_contribution \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/perf_optimize/images/cast.png b/docs/mindformers/docs/source_en/perf_optimize/images/cast.png deleted file mode 100644 index c819d1ddfb48226447d7dfe99430c41e7df5f26a..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/perf_optimize/images/cast.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/perf_optimize/images/mstx.png b/docs/mindformers/docs/source_en/perf_optimize/images/mstx.png deleted file mode 100644 index 171c36574dbf9dc6893866f1471ecf6e47c906f9..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/perf_optimize/images/mstx.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/perf_optimize/images/reshape.png b/docs/mindformers/docs/source_en/perf_optimize/images/reshape.png deleted file mode 100644 index 6f9b5e46046b52db23b521a5bc8f0823b3139508..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/perf_optimize/images/reshape.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/perf_optimize/images/silu_mul.png b/docs/mindformers/docs/source_en/perf_optimize/images/silu_mul.png deleted file mode 100644 index e297d755b65b393819e25b62289a0a0b37d3ea96..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/perf_optimize/images/silu_mul.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/perf_optimize/images/studio.png b/docs/mindformers/docs/source_en/perf_optimize/images/studio.png deleted file mode 100644 index d902f35afed5559eb1f25a38f3227db5af9783fb..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_en/perf_optimize/images/studio.png and /dev/null differ diff --git a/docs/mindformers/docs/source_en/perf_optimize/perf_optimize.md b/docs/mindformers/docs/source_en/perf_optimize/perf_optimize.md deleted file mode 100644 index a10ed0628b327ce4a332f586fb4993be14bf1a53..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/perf_optimize/perf_optimize.md +++ /dev/null @@ -1,687 +0,0 @@ -# Large Model Performance Optimization Guide - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/perf_optimize/perf_optimize.md) - -## Overview - -This document introduces the performance tuning of large language models, detailing the basic theoretical knowledge related to performance tuning, guidance on the use of related tools and the overall idea of performance tuning, as well as case sharing. 
Before starting performance tuning of a large model, you should already have basic knowledge of large models. To stay focused, this document does not explain basic large-model concepts and concentrates on performance tuning.
-
-Performance here mainly refers to model training performance: the time required to complete a single end-to-end training step, given a specified model and input data. End-to-end refers to completing one training step of the AI model, and this time mainly consists of the following components:
-
-* Data loading time: the time for the model to load the training data and weights, including reading the data from the hardware storage device into the CPU, preprocessing the data on the CPU, and moving the data from the CPU to the NPU. For models that need to be sliced onto several NPUs, the data loading time also includes the time to broadcast the data from one NPU to the other NPUs.
-
-* Model forward and backward computation time: covers the forward computation and the backward differentiation that derives the gradients.
-
-* Optimizer time: the time spent updating the model parameters.
-
-* Model post-processing time: the time spent after the optimizer update, including data post-processing or necessary synchronization operations, usually depending on model-specific operations.
-
-* Communication time: a broad concept, covering the inter-card communication time within a single node and the inter-node communication time across multiple nodes. With the parallelization techniques in MindSpore, communication and computation can usually be executed in parallel, in which case part of the communication time is masked; therefore we generally consider only the communication time that is not masked by computation.
-
-* Scheduling time: the time it takes to go from a CPU instruction to invoking the NPU kernel.
-
-Performance tuning reduces the time of the above components by optimizing the model algorithm, parameters, parallel strategy, and other means, generally focusing on the model forward/backward computation time and the communication time.
-
-## Introduction
-
-### Performance Indicators
-
-Performance is usually evaluated by throughput. For large language models, throughput mainly refers to the number of tokens processed per card per second. The formula is as follows:
-
-$$
-Throughput = SeqLength * (sample/s/p)
-$$
-
-The result of (sample/s/p) can be read directly from the log, or the corresponding fields can be obtained separately from the log and then calculated.
-
-The meaning of each field is as follows:
-
-* SeqLength: the length of the sequence. For text processing, the input text is converted into a sequence of numbers, and these number sequences are used as input to the model. SeqLength is the length of these number sequences, that is, the length of the text. During training and inference, a fixed SeqLength needs to be specified for batch processing and computation. A longer SeqLength improves model accuracy but increases computation and memory consumption, while a shorter SeqLength reduces computation and memory consumption but may decrease model accuracy.
-
-* sample: its value is equal to global_batch_size. In distributed training, the data is divided into multiple parts, and each part is sent to a different NPU for computation.
The batch size on these NPUs adds up to the global batch size. The choice of global batch size is an important decision because it directly affects the training performance of the model. If the global batch size is too small, the batch size on each NPU may be too small, resulting in slower convergence of the model. If the global batch size is too large, the batch size on each NPU may be too large, resulting in either a lack of NPU memory or a decrease in the accuracy of the model. A good rule to find the optimal Batch Size is to reach the NPU's memory limit for a given data type, i.e., the Batch Size fills up the NPU memory. - -* s: i.e., per_step_time in seconds, refers to the time spent on each step in the training process. - -* p: i.e., parallel_num, data parallel dimension size. - -### Introduction to Parallel Feature - -In large model training, due to the increase of data volume and model complexity, the computational capacity of a single computing node is difficult to meet the training demand. In order to improve the training efficiency and accelerate the training process, a parallel strategy is usually used to distribute the computational tasks to multiple computational nodes. - -Parallelism strategies are usually classified into various parallel modes: - -* Data Parallelism (DP for short) - -* Model Parallelism (generally referred to as Tensor Parallelism, TP for short) - -* Pipeline Parallelism (PP for short) - -* Optimizer Parallelism (OP for short) - -* Sequence Parallelism (SP for short) - -* Multi-Copy Parallelism - -In practice, multiple parallel strategies and multiple optimizations, such as using optimizer parallelism and recomputation, are usually employed to reduce the model's use of memory and improve training efficiency. Parallel strategy design is closely related to the efficiency of the model, and it is crucial to identify one or more sets of better parallel strategies before model tuning. - -For details, refer to [Parallel Strategy Guide](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/distributed_parallel.html). - -For models with different parameter count specifications, the following parallel strategy can be selected: - -* When the model size is small (~7B), pure data parallelism + optimizer parallelism can be used, and gradient accumulation can be further turned on if memory is sufficient; -* When the model size is moderate (~13B), pipeline parallelism can be further used and recomputation can be adjusted so that a single card video memory can support the training of the sliced model and reduce the amount of communication introduced; -* When the model size is large, model parallelism should be turned on to reduce the memory consumption of the weights, while short sequence parallelism and multi-copy parallelism are also recommended to be turned on to improve performance; -* When training long sequences (>=32k), long sequence parallelism and correlation features can be used to reduce the memory usage of long sequence activation values. - -### Recomputation - -MindSpore uses automatic differentiation in backward mode to automatically derive the backward diagram based on the forward diagram computation flow, and the forward and backward diagrams together form a complete computation diagram. When computing some backward operators, the results of some forward operators need to be used, resulting in the need for the results to reside in memory. 
Until the backward operators that depend on them have been computed, the memory occupied by the results of these forward operators will not be reused. This phenomenon pushes up the memory spikes for training, and is particularly significant in large-scale network models. - -To solve this problem, MindSpore provides the ability to recompute the forward operator without saving the results of the forward operator, so that this memory can be reused, and then recompute the forward operator when computing the backward operator, if the forward result is needed. - -Re-computation is categorized in the following two ways: - -* Full-recomputation - - For extreme environments where memory resources are extremely limited. In this mode, all activation values are recalculated when needed, except for saving the input data, minimizing the dependence on memory. However, the corresponding amount of computation increases significantly. - -* Partial-recomputation - - This strategy preserves activation values that take up less memory space but are more expensive to recompute, such as Cast, SiLU-Mul. At the same time, activation recomputation is performed for activation values that occupy a large amount of memory but have relatively low recomputation costs. This method achieves efficient management of memory usage while ensuring model performance. - -#### Cast Recomputation - -RMSNorm generally uses high-precision (FP32) computation, and the input needs to be converted from low-precision (FP16 or BF16) to high-precision (FP32) via Cast before computation. RMSNorm needs to save the input for reverse computation. Therefore, recomputing Cast here only saves the low-precision input of Cast instead of the high-precision input of RMSNorm, a move that reduces the memory usage of that input by half, resulting in memory savings. - -![cast](./images/cast.png) - -Performing recomputation from high precision to low precision Cast operator will result in the later operators originally only need to store the low precision memory after Cast, and after the Cast operator recomputation, they need to store the high precision memory, which will result in larger memory usage instead. - -#### SiLU-Mul Recomputation - -In FeedForward, the middle part of the memory tends to be large. SiLU and Mul recomputation is less costly, so recomputing the SiLU and Mul operators saves memory for the first inputs of MatMul and Mul of w2. - -![SiLU_mul](./images/silu_mul.png) - -### Tools Introduction - -#### profiler Tool - -MindSpore Transformers itself integrates profiling data collection with the following steps: - -1. Modify the configuration files - - Turn on the profiling switch in the model configuration file with the following parameters to be changed: - - ```yaml - profile: True # Whether to enable performance analysis tools - profile_start_step: 5 # Step that starts performance analysis - profile_stop_step: 6 # Step that ends performance analysis - init_start_profile: False # Enabled when Profiler is initialized, profile_start_step will not take effect after it is enabled. - profile_communication: False # Whether to collect communication performance data in multi-NPU training - profile_memory: True # Collect Tensor memory data - mstx: True # Whether to enable mstx timestamp recording. - ``` - - `profile_start_step` and `profile_stop_step` determine the collection interval, because the collection takes a long time. It is not recommended to set the interval too large, and it should be set to 2 to 4 steps. 
Since the first step involves compilation, it is recommended to start collecting from step 3. - - The parameters of profiling configuration are shown as below: - - | Parameters | Descriptions | Types | - |-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| - | profile | Whether to enable the performance capture tool. Default: `False`. | bool | - | profile_start_step | Set the number of steps to start collecting performance data. Default: `1`. | int | - | profile_stop_step | Set the number of steps to stop collecting performance data. Default: `10`. | int | - | profile_communication | Set whether communication performance data is collected in multi-device training, this parameter is invalid when using single card training. Default: `False`. | bool | - | profile_memory | Set whether to collect Tensor memory data. Default: `True`. | bool | - | profile_rank_ids | Specify rank ids to enable collecting performance data. Defaults to `None`, which means all rank ids are enabled. | list | - | profile_pipeline | Set whether to enable collecting performance data on one card of each parallel stage. Default: `False`. | bool | - | profile_output | Set the directory of saving performance data. | str | - | profile_level | Set the collection level. Should be one of (0, 1, 2). Default: `1`. | int | - | with_stack | Set whether to collect Python-side stack trace data. Default: `False`. | bool | - | data_simplification | Set whether to enable data simplification, which will delete the FRAMEWORK directory and other extraneous data after exporting performance data. Default: `False`. | int | - | init_start_profile | Set whether to turn on collecting performance data when the Profiler is initialized; this parameter does not take effect when `profile_start_step` is set. This parameter needs to be set to `True` when `profile_memory` is turned on. | bool | - | mstx | Set whether to enable mstx timestamp recording, including training step, HCCL-operators and etc. Default: `False`. | bool | - -2. View Data - - By default, the collection tool creates a `profile` folder under the `./output` path, which can be set via the `profile_output` or `output_dir` field of the model's yaml configuration file, and the former has higher priority. - - The generated file and its introduction refer to [Introduction to profile file](https://www.mindspore.cn/tutorials/en/r2.6.0/debug/profiler.html), which mainly collects information such as running time of operators and tasks, CPU utilization and memory consumption for performance tuning analysis. - - In addition, it can also analyze the performance between different ranks in the cluster by counting the computation time, communication time, and unmasked communication time of each rank in the cluster, so as to determine whether there exists an unbalanced computation load, which affects the overall efficiency of the cluster, and carry out targeted optimization. - -3. View mstx timestamp - - The collection tool does not generate files of mstx information directly, so it need to be extract from `profile` folder manually via command line. 
Taking the first device for example, the corresponding directory structure is shown below: - - ```sh - output - └── profile - └── rank_0 - └── {hostname}_{pid}_{timestamp}_ascend_ms - └── PROF_{number}_{timestamp}_{string} - ``` - - Execute the command below: - - ```shell - msprof --export=on --output={path}/output/profile/rank_0/{hostname}_{pid}_{timestamp}_ascend_ms/PROF_{number}_{timestamp}_{string} # replace with the real path - ``` - - A `mindstudio_profiler_output` folder will be generated under PROF_{number}_{timestamp}_{string} directory after command is over, and the file named `msprof_tx_{timestamp}.csv` records mstx information, containing timestamp and description of training steps, HCCL-operators, etc., as shown in the figure below: - - ![mstx](./images/mstx.png) - -#### DryRun Memory Evaluation Tools - -Current memory evaluation tools mainly use MindSpore dryrun. The simulated compilation is described in MindSpore [Environment Variables Documentation](https://www.mindspore.cn/docs/en/r2.6.0/api_python/env_var_list.html) and [msrun Documentation](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/msrun_launcher.html). The training process for simulation compilation can be pulled up by enabling the environment variable `export MS_SIMULATION_LEVEL=1` before the training process starts or by configuring the `-sim_level` function in the msrun startup item. - -DryRun can be used to analyze whether the required memory exceeds the maximum available memory. If it exceeds, the configuration needs to be readjusted. The maximum available memory can be configured using the following fields, the recommended value is `58GB`. If it is set too large, it may cause other components to run out of memory. Typically the larger the cluster training size used, the larger the memory usage of the other components, and the lower the maximum memory available to the MindSpore process. For example on a thousand card cluster, this maximum available memory value is typically set to `54GB`. 
- -```yaml -context: - max_device_memory: "58GB" -``` - -Create a new script `dry_run.sh` with the following contents: - -```shell -#!/bin/bash - -YAML_FILE=$1 -RANK_SIZE=$2 -PIPELINE_STAGES=$3 -RANK_GAP=$((RANK_SIZE/PIPELINE_STAGES)) -ROOT_PATH=`pwd` - -export MS_SIMULATION_LEVEL=1 -export RANK_SIZE=$RANK_SIZE - -rm -rf output_dryrun -mkdir output_dryrun -for((i=0; i<$PIPELINE_STAGES; i++)) -do - export DEVICE_ID=$i - export RANK_ID=$((i*RANK_GAP)) - echo "start training for rank $RANK_ID, device $DEVICE_ID" - # The run_mindformer.py path needs to be specified correctly - python ./run_mindformer.py --config $ROOT_PATH/$1 &> ./output_dryrun/rank_$RANK_ID.log & -done -``` - -Execute the script: - -```shell -bash dry_run.sh $train.yaml $rank_size $stage -``` - -The meanings of the three parameters are as follows: - -* $train.yaml: configuration file to be debugged -* $rank_size: number of simulation cards -* $stage: number of stages, equal to the number of pipeline parallels - -After execution is complete, log messages for each stage are generated in the output directory `output_dryrun`, and the following message is printed at the end of each log: - -```text -Device MOC memory size: 62432M -MindSpore Used memory size: 59392M -MindSpore memory base address: 0 -Used peak memory usage (without fragments): 48874M -Actual peak memory usage (with fragments): 48874M -``` - -Used peak memory usage (without fragments): Indicates peak NPU memory usage without fragmentation, focus on this value and recommend not exceeding the maximum available memory. - -Actual peak memory usage (with fragments): Indicates peak NPU memory usage with fragmentation. - -Notes: - -1. When using `dryrun` to simulate compilation, if the dataset is too large, it will lead to a long run time, so you need to control the dataset size, just run through a few steps; -2. In the pipeline parallel scenario, each PP stage requires different memory during the training process, so at least one rank is needed for each stage for dryrun. In other words, the memory situation of all the ranks within the same PP stage is exactly the same, and the overall memory situation can be analyzed by running the simulation compilation of only one rank; -3. The `dryrun` task also generates distributed policy files. Starting the `dryrun` task generates the policy files for each PP stage. Since the distributed policy files for the same stage are exactly the same, you only need to get one policy file per PP stage; -4. The size of memory consumed by the current task will be printed in the log at the end of the run. Memory usage can be evaluated based on this information for memory tuning. - -#### MindStudio Insight - -MindStudio Insight provides multiple presentations of performance data, including visual presentations of Timeline views, communication analysis, computational elapsed time, so that users can analyze potential performance bottlenecks and provide guidance on how to take steps to eliminate or reduce them. MindStudio Insight supports viewing data exported by Profiling in Timeline View for cluster scenarios and displaying it in a single-card dimension, and can support cluster performance file analysis of more than 20GB. - -Click [MindStudio Insight download link](https://www.hiascend.com/developer/download/community/result?module=pt+sto+cann) and select the appropriate version to install. 
- -Open MindStudio Insight, click the "+" in the toolbar at the top left of the interface, select the file or directory to be parsed and exported in the pop-up window, and then click “Confirm” to import. - -MindStudio Insight tool presents the full process of online inference, training process in the form of a Timeline, and in accordance with the scheduling process to present the overall operating conditions, and the tool supports cluster Timeline display. By analyzing the timeline, users can analyze the online inference/training process at a fine-grained level, such as whether the iteration gap is too long, operator execution time, and provide easy-to-use features to assist users to quickly locate performance bottlenecks. - -The Timeline interface consists of four parts: the toolbar (Area I), the timeline tree (Area II), the graphical pane (Area III), and the data pane (Area IV), as shown in the figure. - -![studio](./images/studio.png) - -* Area I - - The toolbar, which contains frequently used buttons, from left to right, is Marker List, Filter (supports filtering the display by card or by special layer), Search, Link Events, Recovery, Timeline Zoom Out and Timeline Zoom In. - -* Area II - - Timeline tree diagram showing the hierarchical information of each “Card” in the cluster scenario, with “Card” at the first level, process or specialization hierarchies at the second level, and threads at the third level. This includes upper application data (containing elapsed time information of upper application arithmetic), CANN layer data (containing elapsed time data of AscendCL, GE, and Runtime components), underlying NPU data (containing elapsed time data and iteration trajectory data of each Stream task flow under Ascend Hardware, HCCL and Overlap Analysis communication data, and other Rise AI processor system data), hitpoint data, and the AI Core Freq hierarchy. - -* Area III - - The graphical pane, which displays data within an iteration, corresponds to a timeline tree diagram, which provides a row-by-row graphical presentation of the timeline, including the execution sequence and execution duration of the upper-level application operators, components and interfaces. - -* Area IV - - Data pane, statistical information or operator detail information display area, Slice Detail for detailed information on selected individual operators, Slice List for a list of operators in the selected area of a lane, and System View for a summary of operators in a category. - -Click anywhere on the timeline page tree or graphical pane can be performed using the W (zoom in), A (move left), S (zoom out), and D (move right) keys in the keyboard, which support zooming in with a maximum precision of 1ns. This tool can provide overview, memory, arithmetic, communication and other dimensions of analysis to assist in performance tuning. Refer to [MindStudio Insight User Guide](https://www.hiascend.com/document/detail/zh/mindstudio/70RC3/msinsightug/msascendinsightug/Insight_userguide_0002.html) for detailed usage. - -#### IR Graph - -In the [MindSpore Transformers configuration file](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html), just turn on save_graphs, and the runtime will output some intermediate files ending with the .ir suffix generated during the graph compilation process, which we call IR files. By default, a directory of graphs will be generated in the current task execution directory, and all IR graphs will be saved in this. 
It is a relatively intuitive and easy to understand document describing the structure of the model in text format, which can be viewed directly with text editing software. Refer to [Config Configuration Description](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html) for the meaning of the configuration items, and the configuration method is as follows: - -```yaml -context: - save_graphs: True - save_graphs_path: "./graph" -``` - -An excerpt of some of the IR graph: - -```text - %13(equiv_180_CNode_16165) = Load(%para6_model.layers.0.attention.wq.weight, UMonad[U]) cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782039"} - : (, ) -> () - # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Load-op0) - %14(equiv_16877_x) = PrimFunc_MatMul(%12, %13, Bool(0), Bool(1)) {instance name: matmul} primitive_attrs: {in_strategy: ((1, 1), (8, 1))} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782146", origin_output_shape: (4096, 4096), micro: I64(0), origin_input_shapes: ((4096, 4096), (4096, 4096))} {in_strategy: ((1, 1), (8, 1))} - : (, , , ) -> () - # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/wq-Linear/MatMul-op0) - %15(equiv_16876_CNode_30913) = PrimFunc_Reshape(%14, (I64(1), I64(4096), I64(4), I64(128))) {instance name: reshape} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "817859", forward_comm_node_unique_id: "729440", micro: I64(0)} - : (, ) -> () - # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Reshape-op0) - %16(equiv_16875_query) = PrimFunc_Transpose(%15, (I64(0), I64(2), I64(1), I64(3))) {instance name: transpose} primitive_attrs: {in_strategy: ((1, 1, 8, 1))} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782042", micro: I64(0)} {in_strategy: ((1, 1, 8, 1))} - : (, ) -> () - # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Transpose-op0) -``` - -`%XX` indicates the step, followed by the name of the operator, and the parentheses contain the inputs and outputs, while Fullname with scope contains the completed class, method name, and so on. - -* `%13` - - This step loads wq.weight directly and gets . - -* `%14` - - MatMul with the previous %12 output and the %13 output above to get . - -* `%15` - - Reshape with the 14% output above to get . - -* `%16` - - Transpose with the 15% output above to get . - -It is recommended to change the number of layers of the model to a smaller size when saving IR graph, to reduce the time of compiling and saving graph, and to facilitate fast debugging. 
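As a minimal illustration of that advice (assuming a LLaMA-family configuration in which the layer count is controlled by the `num_layers` field), the layer number can be temporarily reduced in the model configuration for such debug runs:

```yaml
# Debug-only sketch: shrink the model so graph compilation and IR saving are fast.
# Restore the original value before real training.
model:
  model_config:
    num_layers: 2
```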
For details, please refer to [Introduction to IR file](https://www.mindspore.cn/tutorials/en/r2.6.0/debug/error_analysis/mindir.html#ir-introduction) and [Analysis samples](https://www.mindspore.cn/tutorials/en/r2.6.0/debug/error_analysis/mindir.html#how-to-derive-the-cause-of-the-failure-based-on-the-analyze-fail-ir-file-analysis-graph). - -#### SAPP Automatic Load Balancing Tool - -Large model training performance tuning requires simultaneous consideration of multi-dimensional hybrid parallel strategy configurations and memory constraints, and engineers need to try different combinations of schemes on the cluster to find a parallel strategy that achieves the required performance, and the process often takes weeks and consumes a lot of arithmetic costs. - -MindSpore provides SAPP (Symbolic Automatic Parallel Planner) automatic load balancing tool. Inputting the model memory and time information, as well as some of the pipeline parallel performance-related hyper-references (e.g., the impact of recomputation on performance), the tool will construct the linear programming problem by itself, through the global solution, automatically generate stage-layer ratios in the pipeline parallel for the large model, adjust the recalculation strategy of each layer, automatically optimize the cluster arithmetic power and memory utilization, reduce the idle waiting time, realize the Pipeline parallel minute-level strategy optimization, greatly reduce the performance tuning cost, and significantly improve the end-to-end training performance. - -For detailed usage, please refer to [SAPP Pipelined Load Balancing](https://gitee.com/mindspore/mindformers/tree/r1.5.0/toolkit/pipeline_balance) tool introduction. - -## Overall Concept - -The performance optimization method for large models mainly relies on profiling data analysis as well as memory analysis to analyze the current performance bottlenecks and make targeted optimization actions, then verify the performance gains and analyze further optimization directions. The overall tuning process is as follows: - -1. Analyze the profiling data to see if there are operators with significantly abnormally high time consumption, if so, try to replace the equivalent operator and submit the time consumption information of the abnormal operator to issue for feedback; -2. Analyze the communication time consumption to see if there exists a more optimal distributed strategy, look at the IR graph to analyze if there exists an unreasonable rearranging problem, and solve these problems affecting the communication efficiency in order to improve the training efficiency of the whole cluster; -3. Analyze memory usage to see if there is an abnormally large memory Tensor, whether there is a fusible operator to reduce the activation value memory. In the case of sufficient memory, the configuration strategy of recomputation can be adjusted to select, the use of spare memory in exchange for training performance, or reduce the number of copies of the model slices to reduce the communication overhead brought by the model slices to improve performance. 
- -Performance optimization is a cyclic process, after the performance of the operator is not obviously abnormal, we can test and analyze the distributed strategy to optimize the abnormal communication time and rearranging overhead; then we can optimize and analyze the memory to eliminate the abnormal large memory Tensor; after completing the memory optimization, we need to further check whether the free memory supports to re-adjust the parallel strategy settings to get the strategy with smaller communication overhead and make full use of the memory in exchange for better performance. This cycle of optimization leads to a step-by-step achievement of the set performance goals. - -After completing a round of performance optimization, it is also necessary to ensure that the model accuracy is aligned, and apply this optimization strategy if it is aligned. - -## Bottleneck Analysis and Optimization - -After clarifying the overall tuning idea, we can analyze the performance bottlenecks of the training model through performance analysis tools and memory evaluation tools, and apply optimization measures to the bottlenecks, verify the benefits, and analyze new bottlenecks for further optimization, so as to approach the optimal solution of the model training performance step by step. The following is a list of common performance bottlenecks and the corresponding optimization measures available. - -### Memory Bottleneck - -Memory bottleneck is the first problem that needs to be solved in large model training scenarios; with the expansion of model size, the memory resources required for training large models also rise, and the memory capacity provided by a single card is limited, so it is necessary to solve the problem of insufficient memory by combining recomputation, optimizer parallelism, and other means through the distributed parallelism strategy, and slicing the resources required for model training on a multi-card cluster. - -Optimizations for memory bottleneck scenarios are listed below: - -* **Model Parallel(MP)/Tensor Parallel(TP)**: - * Applicable scenarios: large number of model parameters, need to reduce the weight of a large number of memory-consuming scenarios; - * Benefits: the most reduction in memory usage is achieved by using multiple cards to slice the model weights; - * Overhead: use more hardware resources and introduce a lot of communication overhead; - * Usage recommendation: it is recommended to use it on models with more than 20B parameters and limited to 8 to avoid generating cross-machine communication overhead. -* **pipeline Parallel(PP)**: - * Applicable scenarios: Scenarios where static memory can't fit in model weights, optimizer state, etc; - * Benefits: The communication overhead is much smaller than MP using the multi-card slice modeling phase; - * Overhead: Introduces computational bubble (bubble), and a smaller inter-stage communication overhead; - * Usage recommendation: Any scenario where the weights need to be sliced can be attempted to use it and reduce bubble performance loss through hyperparameter tuning. -* **Long Sequence Parallel(CP)**: - * Applicable scenarios: Training long sequence tasks (>=32k) with high activation value scenarios; - * Benefits: Long sequence training scenarios apportion activation value overheads, making it possible to expand long sequence capabilities by expanding machine resources; - * Overhead: Introduce communication overhead. 
- -All the above three parallel strategies use more computing devices to share memory consumption to solve the memory bottleneck problem. The cost is that it requires more hardware resources and introduces additional communication, and the training throughput is not as good as data-parallel training on a cluster of the same size. - -* **Optimizer Parallel**: - * Applicable scenarios: In scenarios with data-parallel DP, the model weights and optimizer states are sliced to each card in the DP domain, dramatically reducing video memory consumption; - * Benefits: Model weights and optimizer states are sliced within the DP domain, saving significant memory usage; - * Overhead: The calculation introduces a certain amount of communication to accomplish weight aggregation; - * Usage recommendation: Turning it on is recommended in most cases, and the saved video memory can be used to adjust the parallel slicing strategy to improve performance overall. -* **[Full Recomputation & Selective Recomputation](#recomputation)**: - * Applicable scenarios: After the slicing strategy is determined, the memory usage is still partially exceeded, the full recomputation & selective recomputation strategies can be adjusted to further optimize the memory usage; - * Benefits: Save memory usage; - * Overhead: The computation time grows further; - * Usage recommendation: Prioritize the use of selective recomputation and control the computational overhead from recomputation as much as possible when not exceeding memory usage. -* **Short Sequence Parallel**: - * Applicable scenarios: Under MP slicing, short sequence parallelism is enabled, and the sequence dimension is sliced by MP at LayerNorm, with the communication volume remaining unchanged, reducing the activation value memory and the Norm part of the computation; - * Benefits: Save memory usage and computation time without increasing communication and requiring additional card count resources; - * Usage recommendation: It is recommended to turn it on in all MP scenarios. - -### Computing Length Bottleneck - -Under normal cases, the computation time should be mainly focused on computation-intensive operators such as matmul, flash attention, etc. If the computation operators with abnormal time consuming are found to cause performance bottlenecks in profiling analysis, we can try to replace the equivalent operators, and synchronize the submission of operator performance issue to MindSpore Transformers or MindSpore. - -At the model tuning level, the following methods can be tried to solve the problem of alleviating the computational length bottleneck: - -* **Fusion Operator Replacement**: - * The use of fusion operators equivalently replaces partial combinations of operators, and fusion operators typically result in performance and memory gains. -* **Recomputation & Selective Recomputation**: - * Involving a balanced trade-off between time and space, reducing the number of recomputation layers can effectively utilize free memory to improve computational performance when free memory is available. - -### Unmasked Communication Bottleneck - -The communication time share of the training process can be obtained through the profiling tool, which includes masked and unmasked communication. Masked communication and computation are executed at the same time, which does not affect the training efficiency, while unmasked communication causes computation to wait for the communication, which is too time-consuming and will affect the training performance, and needs to be optimized. 
-
-* **IR Graphs Analyze Redundant Communication Operators**:
-  Analyze the distribution of communication operators in the model forward process by configuring the environment variable `export MS_DEV_SAVE_GRAPHS=1`, saving the training IR graph, and checking whether it meets expectations;
-  If a series of communication operators appears at an unreasonable location, the operator slicing strategy configured in the model is probably incorrect, triggering tensor redistribution (rearrangement), and the framework automatically inserts a large number of communication operators to ensure computational equivalence;
-  The redundant communication introduced by this rearrangement is likely to produce a large amount of unmasked communication and thus a performance bottleneck. The solution is to correct the shard strategy of the operator at that location so that the communication rearrangement no longer occurs.
-* **Multi-copy & Fine-grained Multi-copy Parallel**:
-  After the communication rearrangement problem has been analyzed and solved, if a large amount of unmasked communication remains, try a multi-copy or fine-grained multi-copy parallel strategy;
-  In model parallel scenarios, enabling multi-copy or fine-grained multi-copy parallelism allows communication time and computation time to partially mask each other, thus reducing communication bottlenecks.
-
-### IO Bottleneck
-
-IO efficiency becomes a performance bottleneck for model training only under certain circumstances, namely when the time it takes to read the training data required for a step is greater than the time it takes to complete all the forward and backward computation and communication of that step. Since data reading is asynchronous with the training process, as long as the IO speed is higher than the training speed, the training data is always ready when the next step starts and IO does not block training; conversely, when the IO speed is lower than the training speed, each step has to wait for the training data to be ready. This blocking time is counted in the overall training time and becomes a performance bottleneck.
-
-This kind of IO bottleneck usually occurs in large-cluster shared-storage scenarios, where many training processes access the same shared storage at the same time, raising IO pressure and reducing efficiency. On the profiling timeline, an IO bottleneck manifests as a large data-reading gap between steps, during which the computation is idle.
-
-The idea for solving IO bottlenecks is to optimize the amount of IO and the IO behavior.
-
-**full_batch=false**:
-
-full_batch is a control item for the data aggregation behavior of MindSpore. When it is configured to true, each card reads the full global batch size of data, the data is then sliced within the graph, and only the data required by the corresponding DP domain is used for training. In large-scale clusters this puts steep pressure on IO: the amount of data read by each card carries DP-fold redundancy, and this redundancy, repeated on every card, adds up to overstress the shared storage and degrade IO performance.
When an IO bottleneck is encountered, it is recommended to switch to full_batch=false, which has been verified to improve IO efficiency noticeably. The configuration method is described in the MindSpore [set_auto_parallel_context interface](https://www.mindspore.cn/docs/en/r2.6.0/api_python/mindspore/mindspore.set_auto_parallel_context.html#mindspore.set_auto_parallel_context). A yaml example is listed below:

```yaml
#yaml file configuration
parallel: # In parallel module
  ...
  full_batch: False # Set full batch to False
  dataset_strategy: [[dp, 1], [dp, 1]] # dp replaced with actual dp configuration number
  ...
```

The two [dp, 1] entries in the `dataset_strategy` array correspond to the [bs, seq_len] dimensions of the two dataset inputs; they must be configured according to the number of dataset inputs and their actual shapes, and the dp slicing applies to the bs dimension.

You can also reduce the amount of IO by starting from the dataset itself and minimizing its space complexity. For example, an input item such as `attention_mask` has a space complexity of O(N^2) and is not well suited to being written directly to storage; instead, read other related information with lower space complexity and let the CPU generate such items while the training process reads the data, which reduces the volume of IO accesses and speeds up data reading overall.

### Too Many Bubbles in the pp Scenario

The main overhead in the pipeline-parallel scenario is the computational idle time (bubbles) it introduces, roughly estimated as $bubble\ ratio=\frac{p-1}{m+p-1}$, where $p$ is the number of pipeline stages and $m$ is the configured micro batch num.

To reduce bubbles, we can start from this formula: with the number of stages fixed, increasing the micro batch num lowers the overall bubble share and effectively improves training efficiency.

However, in some training scenarios the global batch size is a critical training hyperparameter that cannot be adjusted arbitrarily. In that case, the bubble ratio can be optimized with the pp interleave feature.

**Pipeline Interleaving**:

The official configuration description of pipeline_interleave (virtual pipeline) can be found in [set_auto_parallel_context](https://www.mindspore.cn/docs/en/r2.6.0/api_python/mindspore/mindspore.set_auto_parallel_context.html?highlight=pipeline_interleave).

In MindSpore Transformers, multi-stream interleaving is enabled in the `parallel` configuration, e.g. using 1f1b scheduling:

```yaml
parallel:
  ...
  pipeline_config:
    pipeline_interleave: True
    pipeline_scheduler: '1f1b'
  ...
```

After that, configure pp_interleave_num in model_config, e.g. set it to 2 as in the following yaml:

```yaml
model:
  model_config:
    ...
    pp_interleave_num: 2
    ...
```

Benefits: with pp interleave the bubble share becomes $bubble\ ratio=\frac{p-1}{vm+p-1}$, where $v$ is the configured pp_interleave_num; the formula shows that increasing $v$ also reduces the bubble share.

Overhead: the pp interleave algorithm theoretically uses more memory; it is a space-for-time strategy, and using it requires re-tuning the memory usage strategy according to the resulting memory changes.
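As a quick worked example of the two formulas above, consider $p=16$ pipeline stages and $m=16$ micro batches, the same shape as the 1024-card cluster case discussed later in this document: without interleaving the bubble ratio is $\frac{p-1}{m+p-1}=\frac{15}{31}\approx 48\%$, while with $v=3$ it becomes $\frac{p-1}{vm+p-1}=\frac{15}{63}\approx 24\%$, so the bubble share is roughly halved without changing the global batch size.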
### Load Balance Policy Tuning

In distributed training, the pipeline parallel strategy introduces load imbalance among the cards.

Under pipeline parallelism the model is sliced into stages by layer, and the modules outside the transformer layers, such as the embedding, the head and the loss calculation, are placed in the first and last stages. The computation time of the first and last stages is therefore higher than that of the middle stages, which is the load imbalance in time. In addition, because of the pipeline schedule in which forward runs before backward, the stage that executes earliest is the last to release all of its memory, so memory consumption differs between stages: the earlier the stage, the more memory it consumes, which is the imbalance in space.

In this case, you can manually adjust the number of layers assigned to each stage by configuring the layer offset `offset` of the model.

For example, when the PP stage number is 4 and the first stage consumes too much memory, you can set `offset:[-2, 1, 1, 0]` to move two layers of load from stage 0 onto stage 1 and stage 2 (one layer each). This reduces the memory consumption of the first stage, while the computational load limit shifts from the first and last stages to the middle stages that carry the extra layer, which does not reduce computational efficiency too much.

Try not to allocate too many layers to a single stage, otherwise that stage becomes the efficiency bottleneck and slows down the whole training process. Finer-grained load balancing can be done together with recomputation, which trades memory for computation.

It is recommended to try the [Automatic Load Tool](#sapp-automatic-load-balancing-tool) to obtain an optimal load balancing policy configuration.

## Typical Case

### Silu-Mul Recomputation Not in Effect

Performing recomputation on Silu and Mul saves memory when fine-grained multi-copy is on, but does not save memory when fine-grained multi-copy is off. The localization process is as follows:

1. Confirm that recomputation is configured

    Check whether the Cast, Silu and Mul operators carry the label "recompute: Bool(1)" in the IR graph. If they do, the operators have recomputation configured.

2. Check for recomputation operators

    Check whether the Cast, Silu and Mul operators carry the duplicated label in the IR graph. Operators without this label are not actually recomputed in the computational graph. In the following example, only the Cast operator carries the duplicated label.

    ```text
    %1834(CNode_108839) = PrimFunc_Cast(%1833, I64(43)) {instance name: cast} primitive_attrs: {output_names: [output], input_names: [x, dst_type], recompute: Bool(1)} cnode_attrs: {recompute_sub_graph: U64(64), recompute_id: I64(65), duplicated: Bool(1), need_cse_after_recompute: Bool(1)} cnode_primal_attrs: {micro: I64(0)}
    : (, ) -> ()
    ```

3. Check the inputs of the backward computation

    Check in the IR graph whether the inputs to the backward operators of Silu and Mul are as expected. With fine-grained multi-copy off there are Reshape operators between Silu and Mul and between Mul and MatMul, whereas with fine-grained multi-copy on, Silu, Mul and MatMul are directly connected.
    The process is as follows:

    ![reshape](./images/reshape.png)

The cause is that the input shape of Linear in the fine-grained multi-copy scenario is two-dimensional, while in the non-fine-grained multi-copy scenario it is three-dimensional, so a Reshape operator sits between Linear and Mul; not recomputing this Reshape causes the recomputation of Silu to be optimized away. Additionally recomputing the Reshape restores the expected memory reduction. The reference configuration is as follows:

```yaml
recompute_config:
  recompute: False
  select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w2\.reshape']
```

### Llama2-13B Extreme Performance Optimization

By default, Llama2-13B on a single machine uses DP: 8, MP: 1, PP: 1 with full recomputation on, giving a performance of about 1860 tokens/s/p and 40% MFU, which is clearly lower than the 7B (53% MFU) and 70B (47% MFU) models.

Analysis shows that the 13B performance bottleneck mainly lies in memory: on either one or several machines, without MP slicing, full recomputation has to be turned on, and even selective recomputation of Silu and Mul does not leave enough memory; full recomputation adds an extra 20% to 25% of computation, resulting in low performance. With MP slicing, recomputation can be turned off, but the performance is slightly lower than with pure DP.

Adjusting the sharding strategy to DP: 8, MP: 1, PP: 2, micro: 128 on two machines with full recomputation on improves performance to 2136 tokens/s/p. Changing full recomputation to selective recomputation and carefully choosing the operators to minimize the memory used by each layer improves performance to 2189 tokens/s/p.

```yaml
select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w1\.matmul', 'feed_forward\.w3\.matmul', 'feed_forward\.w3\.reshape', 'feed_forward\.w2\.matmul', 'feed_forward\.w2\.reshape', 'ffn_norm\.norm', 'ffn_norm\.rcast', 'attention_norm\.norm', 'attention_norm\.rcast', 'attention\.wq\.reshape', 'attention\.wk\.reshape', 'attention\.wv\.reshape', 'attention\.wo\.matmul', 'attention\.wo\.reshape', 'attention\.merger_head_transpose', 'add', 'attention\.flash_attention']
```

Adjusting the number of recomputation layers per stage so that stage 1 recomputes less improves performance to 2210 tokens/s/p.

```yaml
select_recompute:
  'feed_forward\.mul': [20, 8]
  'feed_forward\.w1\.activation': [20, 8]
  'feed_forward\.w1\.matmul': [20, 0]
  'feed_forward\.w1\.reshape': [20, 8]
  'feed_forward\.w3\.matmul': [20, 0]
  'feed_forward\.w3\.reshape': [20, 0]
  'feed_forward\.w2\.matmul': [20, 0]
  'feed_forward\.w2\.reshape': [20, 0]
  'ffn_norm\.norm': [20, 0]
  'ffn_norm\.rcast': [20, 0]
  'attention_norm\.norm': [20, 0]
  'attention_norm\.rcast': [20, 0]
  'attention\.wq\.reshape': [20, 0]
  'attention\.wk\.reshape': [20, 0]
  'attention\.wv\.reshape': [20, 0]
  'attention\.wo\.matmul': [20, 0]
  'attention\.wo\.reshape': [20, 0]
  'attention\.merger_head_transpose': [20, 0]
  'add': [20, 0]
  'attention\.flash_attention': [20, 0]
```

Using graph compilation level O0/O1 with graph kernel fusion yields further memory savings; the selective recomputation of most operators is then changed to full recomputation of a few layers, with selective recomputation of Silu and Mul configured for the remaining layers.
With the number of fully-recomputed layers in stage 0 and stage 1 set to 13 and 5 respectively, performance improves to 2353 tokens/s/p. Gradually reducing the number of fully-recomputed layers in stage 0 and stage 1 to 4 and 0 respectively improves performance to 2562 tokens/s/p (max_device_memory: 57.2GB). The reference configuration is as follows:

```yaml
recompute_config:
  recompute: [4, 0]
  select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w2\.reshape']
```

After the final round of tuning, Llama2-13B performance was optimized to 2562 tokens/s/p, a total improvement of 37%.

### Llama Multi-Card Cluster Training Tuning

Based on the Llama2-70B model configuration, the model hyperparameters are adjusted to expand the parameter count to xxxB, training runs on a 1024-card cluster with shared storage, and the GBS (global batch size) is set to 128. The following performance bottleneck analysis of this case is given as a reference for optimization.

**Case Bottleneck Analysis**:

First, the approximate memory required for training is estimated with DryRun to determine the overall slicing strategy, which is then adjusted to obtain the initial slicing strategy: `DP=8 MP=8 PP=16 micro_batch_num=16`.

The initial slicing strategy was tested and performance and memory data were collected; the performance bottlenecks in this scenario are as follows:

* **IO Bottleneck**: thousands of cards access the shared storage to read data at the same time; the pressure on the storage becomes too high for data reading to keep up with the training speed, resulting in performance fluctuations;
* **Large Vocabulary Memory Bottleneck**: the vocab_size of the custom hyperparameters is on the large side, causing the embedding and lm_head structures to take up too much memory;
* **Unmasked Communication Bottleneck**: with the mp parallel count set to 8, the communication volume is relatively high and more unmasked communication occurs;
* **Too Many Bubbles**: the number of PP stages reaches 16 while micro_batch_num is limited to 16 by the gbs, so there are too many bubbles in the pipeline;
* **Load Imbalance Between Stages**: the memory consumption of stage 0 and stage 1 is too high and the load balancing policy needs to be adjusted.

**Optimization methods**:

For the bottleneck points analyzed above, the following optimization methods can be applied:

1. Read data with full_batch=false: this optimizes IO reads, reduces IO pressure, and removes the performance fluctuations caused by the IO bottleneck;

    Refer to the [IO bottlenecks chapter](#io-bottleneck) for the usage of full_batch. The sample configuration for dp8 here is:

    ```yaml
    parallel: # In the parallel module
      ...
      full_batch: False # Set full batch to False
      dataset_strategy: [[8, 1],] # dp is 8, one input only
      ...
    ```

2. Configure optimizer parallelism for the embedding parameters: the large vocabulary occupies too much memory, and optimizer parallelism for the vocabulary weights needs extra configuration, which effectively alleviates the memory shortage of the first stage;

    An introduction to the use of optimizer parallelism can be found in the [MindSpore Optimizer Parallelism Documentation](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/optimizer_parallel.html).
In addition, the Llama model has additional configurations for optimizers in the embedding layer, the `parallel_optimizer` in the [LlamaConfig API documentation](https://www.mindspore.cn/mindformers/docs/en/dev/models/mindformers.models.LlamaConfig.html#mindformers.models.LlamaConfig) controls the parallelism of the embedding optimizer; - A sample configuration is shown below: - - ```yaml - parallel: - ... - enable_parallel_optimizer: True # Enable global optimizer parallel - ... - - model: - model_config: - ... - parallel_optimizer: True # Configure optimizer parallelism for embedding layer - ... - ``` - -3. Enable Llama's `fine-grained multi-copy` policy masks most of the communication behavior under the model-parallel policy; - - An introduction to multi-copy parallel can be found in the [MindSpore Multicopy Parallelism Documentation](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/multiple_copy.html), and the behavior of fine-grained multicopy parallelism can be configured in MindSpore Transformers through the ` fine_grain_interleave` item. The reference configuration is as follows: - - ```yaml - model: - model_config: - ... - fine_grain_interleave: 2 # Configure the number of fine-grained multicopy copies, with a default value of 1 to disable it and 2 to enable computational communication masking - ... - ``` - -4. Enable the `pp_interleave` parallel policy and configure `pp_interleave_num` to 3 to effectively reduce the percentage of bubbles; - - An introduction to the multi-streaming interleaving feature can be found in the [MindSpore pipeline parallelism documentation](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/pipeline_parallel.html). In MindSpore Transformers the reference configuration is as follows: - - ```yaml - parallel: - ... - pipeline_config: - pipeline_interleave: true # Enable multi-stream interweaving - pipeline_scheduler: '1f1b' # Scheduling method as 1f1b - ... - - model: - model_config: - ... - pp_interleave_num: 3 # The number of multi-stream interweaving copies is configured as 3 - ... - ``` - -5. Adjust the load between stages, configure `offset` to spread the layers from the first two stages to the subsequent layers with free video memory; - - An introduction to load balancing can be found in [previous load balancing section](#load-balance-policy-tuning), where offset is configured as follows after combining the `pp_interleave_num: 3` configuration: - - ```yaml - model: - model_config: - ... - offset: [[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]] - ... - ``` - - With a `pp_interleave_num` of 3, offset should be configured as three sublists corresponding to the number of flow slices. The length of each sublist is the number of pipeline stages, representing the number of layers that need to be added or subtracted from that position. For the above configuration, stage 0 reduces the load by two layers, allocated to the penultimate two stages. - -6. Fine-tune the recomputation strategy for each stage so that each stage uses as much video memory as possible to get the best performance. - - This part can be completed with [SAPP automatic load balancing tool](#sapp-automatic-load-balancing-tool). 
The recomputation policy configuration obtained after optimization is as follows: - - ```yaml - select_recompute: - 'feed_forward\.mul': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - 'feed_forward\.w1\.activation\.silu': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - 'feed_forward\.w1\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - 'feed_forward\.w2\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - 'add': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - 'cast_up': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - select_comm_recompute: - '.*\.norm': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - 'attention\.wq\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - 'attention\.wk\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - 'attention\.wv\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - 'feed_forward\.w1\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - 'feed_forward\.w3\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - ``` - -**Optimization Result**: - -After the above bottleneck analysis and targeted optimization adjustments, the training performance has been significantly improved to 1.7 times of the pre-optimization (measured data in the then environment, for reference only). - -The above tuning case reflects how we analyze the performance bottlenecks, find available optimization means, and gradually approach the optimal performance configuration of the tuning idea. We hope that this paper can help readers grasp the overall tuning idea, and various different tuning scenarios can be analyzed to clarify the direction of performance optimization, and obtain good training performance. 
\ No newline at end of file diff --git a/docs/mindformers/docs/source_en/quick_start/install.md b/docs/mindformers/docs/source_en/quick_start/install.md deleted file mode 100644 index 9c5920be83161a20b6220bd7f28cce1f32952019..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/quick_start/install.md +++ /dev/null @@ -1,52 +0,0 @@ -# Installation - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/quick_start/install.md) - -## Confirming Version Matching Relationship - -The currently supported hardware is the [Atlas 800T A2](https://www.hiascend.com/hardware/ai-server?tag=900A2) training server. - -The current recommended Python version for the suite is 3.11.4. - -| MindSpore Transformers | MindSpore | CANN | Firmware & Drivers | Mirror Links | -|:----------------------:|:----------------------:|:----------------------:|:----------------------:|:--------------:| -| In-Development Version | In-Development Version | In-Development Version | In-Development Version | Not applicable | - -**Currently MindSpore Transformers recommends using a software package relationship as above.** - -Historical version matching relationship: - -| MindSpore Transformers | MindSpore | CANN | Firmware & Drivers | Mirror Links | -|:----------------------------------------------------:|:-------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------:| -| [1.3.2](https://pypi.org/project/mindformers/1.3.2/) | [2.4.10](https://www.mindspore.cn/install/) | [8.0.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit) | [24.1.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit) | [Link](http://mirrors.cn-central-221.ovaijisuan.com/detail/168.html) | -| [1.2.0](https://pypi.org/project/mindformers/1.2.0/) | [2.3.0](https://www.mindspore.cn/install/) | [8.0.RC2.beta1](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1) | [24.1.RC2](https://www.hiascend.com/hardware/firmware-drivers/community) | [Link](http://mirrors.cn-central-221.ovaijisuan.com/detail/138.html) | - -## Installing Dependent Software - -1. Install Firmware and Driver: Download the firmware and driver package through the [Confirming Version Matching Relationship](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/quick_start/install.html#confirming-version-matching-relationship) to download the installation package, and refer to the [Ascend official tutorial](https://www.hiascend.com/document/detail/zh/quick-installation/24.0.RC1/quickinstg_train/800_9000A2/quickinstg_800_9000A2_0007.html) for installation. - -2. Install CANN and MindSpore: Use the officially provided Docker image (CANN, MindSpore are already included in the image, no need to install them manually) or follow the [Manual Installation](https://www.mindspore.cn/install/en) section on the MindSpore website for installation. 
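Before moving on to install MindSpore Transformers, you can optionally verify that MindSpore and the Ascend software stack work together. A minimal check using MindSpore's own `run_check` interface is shown below; the exact output message may vary slightly between versions.

```bash
# Optional sanity check: verify that MindSpore can run a simple calculation
# on the installed CANN / driver stack before installing MindSpore Transformers.
python -c "import mindspore;mindspore.run_check()"
```

If the environment is set up correctly, the command reports the installed MindSpore version and that the calculation check passed.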
- -## Installing MindSpore Transformers - -Currently only source code compilation installation is supported for in-development version, users can execute the following command to install MindSpore Transformers: - -```bash -git clone -b r1.5.0 https://gitee.com/mindspore/mindformers.git -cd mindformers -bash build.sh -``` - -## Installation Verification - -To determine whether MindSpore Transformers has been successfully installed, execute the following code: - -```bash -python -c "import mindformers as mf;mf.run_check()" -``` - -A similar result as below proves that the installation was successful: - -```text -- INFO - All checks passed, used **** seconds, the environment is correctly set up! -``` \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/quick_start/source_code_start.md b/docs/mindformers/docs/source_en/quick_start/source_code_start.md deleted file mode 100644 index 8c325681ebb5590f47c32eadb29de4351e470dad..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/quick_start/source_code_start.md +++ /dev/null @@ -1,110 +0,0 @@ -# Calling Source Code to Start - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/quick_start/source_code_start.md) - -This section shows how to use MindSpore Transformers to quickly pull up a LoRA low-parameter fine-tuning task based on the Llama2-7B model. To use other models and tasks via MindSpore Transformers, please read the corresponding [model documentation](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/start/models.html). - -## Preparing Weights File - -MindSpore Transformers provides pre-trained weights and word list files that have been converted for pre-training, fine-tuning and inference. Users can also download the official HuggingFace weights and use them after converting the model weights. For convenience, this file won't go into too much detail about converting the original weights here, but you can refer to the [Llama2 documentation](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md) and [weight conversion](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/weight_conversion.html) for more details. Please download the `MindSpore` weights, the converted `.ckpt` file, and the `tokenizer.model` file for subsequent processing. - -| Model Name | MindSpore Weights | HuggingFace Weights | -| ------ | ------ | ------ | -| Llama2-7B | [llama2_7b.ckpt](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/llama2_7b.ckpt) | [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | - -Word list download link: [tokenizer.model](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/tokenizer.model) - -## Preparing Dataset - -1. The dataset file alpaca_data.json used in the fine-tuning process can be obtained at [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca). - -2. Data Preprocessing - - The following command needs to be executed in the MindSpore Transformers code root directory, and replaces {path} below with the local path where the dataset files are stored. - - 1. 
Execute [mindformers/tools/dataset_preprocess/llama/alpaca_converter.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/llama/alpaca_converter.py), and add prompt templates to convert the raw dataset into a multi-round conversation format. - - ```shell - python mindformers/tools/dataset_preprocess/llama/alpaca_converter.py \ - --data_path /{path}/alpaca_data.json \ - --output_path /{path}/alpaca-data-conversation.json - ``` - - **Parameter descriptions** - - - data_path: Input the path to the downloaded file. - - output_path: Save path of the output file. - - 2. Execute [mindformers/tools/dataset_preprocess/llama/llama_preprocess.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/llama/llama_preprocess.py), and generate MindRecord data and convert data with prompt templates to MindRecord format. - - ```shell - python mindformers/tools/dataset_preprocess/llama/llama_preprocess.py \ - --dataset_type qa \ - --input_glob /{path}/alpaca-data-conversation.json \ - --model_file /{path}/tokenizer.model \ - --seq_length 4096 \ - --output_file /{path}/alpaca-fastchat4096.mindrecord - ``` - - **Parameter descriptions** - - - dataset_type: Preprocessed data types. The options include "wiki" and "qa." - - "wiki" is used to process the Wikitext2 dataset, which is suitable for the pre-training and evaluation stages. - - "qa" is used to process the Alpaca dataset, converting it into a question-answer format, which is suitable for the fine-tuning stage. - For other dataset conversion scripts, please refer to the corresponding [model documentation](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/start/models.html). - - input_glob: Path to the converted alpaca file. - - model_file: Path to the model tokenizer.model file. - - seq_length: Sequence length of the output data. - - output_file: Save path of the output file. - - 3. The console outputs the following, proving that the format conversion was successful. - - ```shell - # Console outputs - Transformed 52002 records. - Transform finished, output files refer: {path}/alpaca-fastchat4096.mindrecord - ``` - -## Initiating Fine-tuning - -In the MindSpore Transformers code root directory, execute the following command to launch the fine-tuning task: - -```shell -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/lora_llama2_7b.yaml \ - --train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord \ - --load_checkpoint /{path}/llama2_7b.ckpt \ - --auto_trans_ckpt True \ - --use_parallel True \ - --run_mode finetune" 8 -``` - -**Command Explanation:** - -- `scripts/msrun_launcher.sh`: Script for launching distributed tasks. -- `"run_mindformer.py ..."`: Parameter string for the Python task executed on each card, including: - - `run_mindformer.py`: One-click startup script. - - `--config`: Specifies the task configuration file path, e.g., `configs/llama2/lora_llama2_7b.yaml`. - - `--train_dataset_dir`: Specifies the dataset path, e.g., `/{path}/alpaca-fastchat4096.mindrecord`. - - `--load_checkpoint`: Specifies the checkpoint file path, e.g., `/{path}/llama2_7b.ckpt`. - - `--auto_trans_ckpt True`: Enables automatic checkpoint partitioning. - - `--use_parallel True`: Enables distributed task execution. - - `--run_mode finetune`: Sets the run mode to fine-tuning. -- `8`: Sets the task to runs on 8 NPUs. - -When the following log appears on the console: - -```shell -Start worker process with rank id:0, log file:output/msrun_log/worker_0.log. 
Environment variable [RANK_ID=0] is exported. -Start worker process with rank id:1, log file:output/msrun_log/worker_1.log. Environment variable [RANK_ID=1] is exported. -Start worker process with rank id:2, log file:output/msrun_log/worker_2.log. Environment variable [RANK_ID=2] is exported. -Start worker process with rank id:3, log file:output/msrun_log/worker_3.log. Environment variable [RANK_ID=3] is exported. -Start worker process with rank id:4, log file:output/msrun_log/worker_4.log. Environment variable [RANK_ID=4] is exported. -Start worker process with rank id:5, log file:output/msrun_log/worker_5.log. Environment variable [RANK_ID=5] is exported. -Start worker process with rank id:6, log file:output/msrun_log/worker_6.log. Environment variable [RANK_ID=6] is exported. -Start worker process with rank id:7, log file:output/msrun_log/worker_7.log. Environment variable [RANK_ID=7] is exported. -``` - -It indicates that the fine-tuning task is started, the progress can be monitored in the `output/msrun_log/` directory. - -For more details on Llama2, and more startup approaches, please refer specifically to the `Llama2` [README](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#llama-2) documentation for more support. diff --git a/docs/mindformers/docs/source_en/start/models.md b/docs/mindformers/docs/source_en/start/models.md deleted file mode 100644 index 214632ff40ee3af7479270a86c093dd99675c074..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/start/models.md +++ /dev/null @@ -1,55 +0,0 @@ -# Models - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/start/models.md) - -The following table lists models supported by MindFormers. 
- -| Model | Specifications | Model Type | Latest Version | -|:--------------------------------------------------------------------------------------------------------|:------------------------------|:----------------:|:----------------------:| -| [CodeLlama](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/codellama.md) | 34B | Dense LLM | In-development version | -| [CogVLM2-Image](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_image.md) | 19B | MM | In-development version | -| [CogVLM2-Video](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md) | 13B | MM | In-development version | -| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek3) | 671B | Sparse LLM | In-development version | -| [DeepSeek-V2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek2) | 236B | Sparse LLM | In-development version | -| [DeepSeek-Coder-V1.5](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek1_5) | 7B | Dense LLM | In-development version | -| [DeepSeek-Coder](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek) | 33B | Dense LLM | In-development version | -| [GLM4](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/glm4.md) | 9B | Dense LLM | In-development version | -| [GLM3-32K](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/glm32k) | 6B | Dense LLM | In-development version | -| [GLM3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/glm3.md) | 6B | Dense LLM | In-development version | -| [InternLM2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/internlm2) | 7B/20B | Dense LLM | In-development version | -| [Llama3.1](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/llama3_1) | 8B/70B | Dense LLM | In-development version | -| [Llama3](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/llama3) | 8B/70B | Dense LLM | In-development version | -| [Llama2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md) | 7B/13B/70B | Dense LLM | In-development version | -| [Mixtral](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/mixtral) | 8x7B | Sparse LLM | In-development version | -| [Qwen2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2) | 0.5B/1.5B/7B/57B/57B-A14B/72B | Dense/Sparse LLM | In-development version | -| [Qwen1.5](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen1_5) | 7B/14B/72B | Dense LLM | In-development version | -| [Qwen-VL](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwenvl) | 9.6B | MM | In-development version | -| [Whisper](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/whisper.md) | 1.5B | MM | In-development version | -| [Yi](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/yi) | 6B/34B | Dense LLM | In-development version | -| [Baichuan2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/baichuan2/baichuan2.md) | 7B/13B | Dense LLM | 1.3.2 | -| [GLM2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/glm2.md) | 6B | Dense LLM | 1.3.2 | -| [GPT2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/gpt2.md) | 124M/13B | Dense LLM | 1.3.2 | -| [InternLM](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/internlm/internlm.md) | 7B/20B | Dense LLM | 1.3.2 | -| [Qwen](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/qwen/qwen.md) | 7B/14B | Dense 
LLM | 1.3.2 | -| [CodeGeex2](https://gitee.com/mindspore/mindformers/blob/r1.1.0/docs/model_cards/codegeex2.md) | 6B | Dense LLM | 1.1.0 | -| [WizardCoder](https://gitee.com/mindspore/mindformers/blob/r1.1.0/research/wizardcoder/wizardcoder.md) | 15B | Dense LLM | 1.1.0 | -| [Baichuan](https://gitee.com/mindspore/mindformers/blob/r1.0/research/baichuan/baichuan.md) | 7B/13B | Dense LLM | 1.0 | -| [Blip2](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/blip2.md) | 8.1B | MM | 1.0 | -| [Bloom](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/bloom.md) | 560M/7.1B/65B/176B | Dense LLM | 1.0 | -| [Clip](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/clip.md) | 149M/428M | MM | 1.0 | -| [CodeGeex](https://gitee.com/mindspore/mindformers/blob/r1.0/research/codegeex/codegeex.md) | 13B | Dense LLM | 1.0 | -| [GLM](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/glm.md) | 6B | Dense LLM | 1.0 | -| [iFlytekSpark](https://gitee.com/mindspore/mindformers/blob/r1.0/research/iflytekspark/iflytekspark.md) | 13B | Dense LLM | 1.0 | -| [Llama](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/llama.md) | 7B/13B | Dense LLM | 1.0 | -| [MAE](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/mae.md) | 86M | MM | 1.0 | -| [Mengzi3](https://gitee.com/mindspore/mindformers/blob/r1.0/research/mengzi3/mengzi3.md) | 13B | Dense LLM | 1.0 | -| [PanguAlpha](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/pangualpha.md) | 2.6B/13B | Dense LLM | 1.0 | -| [SAM](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/sam.md) | 91M/308M/636M | MM | 1.0 | -| [Skywork](https://gitee.com/mindspore/mindformers/blob/r1.0/research/skywork/skywork.md) | 13B | Dense LLM | 1.0 | -| [Swin](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/swin.md) | 88M | MM | 1.0 | -| [T5](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/t5.md) | 14M/60M | Dense LLM | 1.0 | -| [VisualGLM](https://gitee.com/mindspore/mindformers/blob/r1.0/research/visualglm/visualglm.md) | 6B | MM | 1.0 | -| [Ziya](https://gitee.com/mindspore/mindformers/blob/r1.0/research/ziya/ziya.md) | 13B | Dense LLM | 1.0 | -| [Bert](https://gitee.com/mindspore/mindformers/blob/r0.8/docs/model_cards/bert.md) | 4M/110M | Dense LLM | 0.8 | - -* ***LLM:*** *Large Language Model;* ***MM:*** *Multi-Modal* \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/start/overview.md b/docs/mindformers/docs/source_en/start/overview.md deleted file mode 100644 index 35abd75bc673242b8763485758925febe2d8995a..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/start/overview.md +++ /dev/null @@ -1,13 +0,0 @@ -# Overall Structure - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/start/overview.md) - -The overall architecture formed by MindSpore Transformers and the end-to-end AI hardware and software ecosystem of MindSpore and Ascend is as follows: - -1. At the hardware level, MindSpore Transformers supports users running large models on Ascend servers; -2. 
At the software level, MindSpore Transformers implements the big model-related code through the Python interface provided by MindSpore and performs data computation by the operator libraries provided by the supporting software package of the Ascend AI processor; -3. The basic functionality features currently supported by MindSpore Transformers are listed below: - 1. Supports tasks such as running training and inference for large models [distributed parallelism](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/distributed_parallel.html), with parallel capabilities including data parallelism, model parallelism, ultra-long sequence parallelism; - 2. Supports [model weight conversion](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/weight_conversion.html), [distributed weight splitting and combination](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/transform_weight.html), and different format of [dataset loading](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/dataset.html) and [resumable training after breakpoint](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/resume_training.html); - 3. Support 25+ large models [pretraining](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/usage/pre_training.html), [fine-tuning](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/usage/sft_tuning.html), [inference](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/usage/inference.html) and [evaluation] (https://www.mindspore.cn/mindformers/docs/en/r1.5.0/usage/evaluation.html). Meanwhile, it also supports [quantization](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/usage/quantization.html), and the list of supported models can be found in [Model Library](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/start/models.html); -4. MindSpore Transformers supports users to carry out model service deployment function through [MindIE](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/usage/mindie_deployment.html), and also supports the use of [MindX]( https://www.hiascend.com/software/mindx-dl) to realize large-scale cluster scheduling; more third-party platforms will be supported in the future, please look forward to it. diff --git a/docs/mindformers/docs/source_en/usage/dev_migration.md b/docs/mindformers/docs/source_en/usage/dev_migration.md deleted file mode 100644 index 168ec938ba841e0fddb73f9ef742163334367dcf..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/usage/dev_migration.md +++ /dev/null @@ -1,139 +0,0 @@ -# Development Migration - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/usage/dev_migration.md) - -This document describes how to develop and build foundation models based on MindSpore Transformers and complete basic adaptation to start the training and inference processes. - -## Building a Foundation Model Based on MindSpore Transformers - -The basic components of a foundation model in MindSpore Transformers include the configurations, models, and tokenizers for large language models (LLMs). In addition, to use the run_mindformer.py unified script to start the training or inference process, you need to prepare the `YAML` configuration file for training or inference. - -### Writing Configurations - -A model configuration is an instance that contains all information about a model. 
The `__init__` methods of all models in MindSpore Transformers receive a model configuration instance as the input parameter. All submodules of the model are initialized based on the information contained in the configuration instance. - -MindSpore Transformers provides the [PretrainedConfig](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.PretrainedConfig.html) class, which provides some common configuration methods. The configuration classes of all models should be inherited from the PretrainedConfig class. Developers only need to define all configuration parameters that help build foundation models. Foundation models of the Transformer type have configuration parameters such as `seq_length`, `hidden_size`, `num_layers`, and `num_heads`, and foundation models of the text type have `vocab_size` in addition. - -For details, see the configuration class [LlamaConfig](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.LlamaConfig.html) of the Llama model in MindSpore Transformers. - -> If your model is similar to a model in the library, you can reuse the same configurations as the model. - -### Writing a Model - -The MindSpore Transformers foundation model is developed based on the MindSpore framework. Developers only need to pay attention to the implementation of the model network. - -MindSpore Transformers provides the [PretrainedModel](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.PreTrainedModel.html) class, which is responsible for storage model configurations and processing the methods of loading and saving models. All model classes must be inherited from the PretrainedModel class, and the model input must be the same. That is, the input parameters of the `construct` method of the model must be the same. For details about the input parameters and meanings, see the Llama model class [LlamaForCausalLM](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.LlamaForCausalLM.html) in MindSpore Transformers. In addition, the model class must implement some abstract methods of the base class, including: - -- `prepare_inputs_for_generation`: method for building input for model inference. -- `prepare_inputs_for_predict_layout`: method for building virtual input for the distributed loading model weight. - -For specific meanings, refer to the descriptions in [LlamaForCausalLM](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.LlamaForCausalLM.html). - -> If your model structure is similar to that of a model in the library, you can reuse the model. - -### Writing a Tokenizer (for LLMs) - -A tokenizer is used to process input and output of LLMs. It is required in the workflow of LLMs. - -MindSpore Transformers provides the [PretrainedTokenizer](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.PreTrainedTokenizer.html) and [PretrainedTokenizerFast](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.PreTrainedTokenizerFast.html) classes, which use Python only and use the Rust library, respectively. The features of the latter one are as follows: - -- Faster batch processing. -- Additional methods for mapping between text strings and lexical spaces. For example, the indexes of the lexical element containing a given character or the character spans corresponding to the given lexical element are obtained. - -All tokenizer classes must be inherited from the PretrainedTokenizer or PretrainedTokenizerFast class. 
For details, see [LlamaTokenizer](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.LlamaTokenizer.html) and [LlamaTokenizerFast](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.LlamaTokenizerFast.html). - -> If your tokenizer is similar to that in the library, you can reuse that in the library. - -### Preparing a Weight and a Dataset - -If a PyTorch-based model weight already exists, you can convert the weight to that in the MindSpore format by referring to [Weight Conversion](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/weight_conversion.html). - -For details about how to prepare a dataset, see [Dataset](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/dataset.html) or the model document, for example, [Llama2 Description Document > Dataset Preparation](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#%E6%95%B0%E6%8D%AE%E5%8F%8A%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87). - -### Preparing a `YAML` Configuration File - -MindSpore Transformers uses a `YAML` file to configure all parameters required by a task, including model parameters, training parameters (such as optimizer, learning rate, and dataset), inference parameters (such as tokenizer), distributed parallel parameters, and context environment parameters. - -The code of the customized model is not in the MindSpore Transformers library, and the customized module in the code is not registered with MindSpore Transformers. Therefore, the customized model cannot be automatically instantiated. The code is also called external code (for example, the code in the `research` directory). Therefore, you need to add the `auto_register` configuration item for automatically registering any module to the corresponding module configuration in the `YAML` file and set the configuration items to the relative import paths of the API to be registered. When the run_mindformer.py script is executed to start the task, you need to add the input parameter `--register_path` of the registration path and set it to the relative path of the directory where the external code is located. - -For example, in the `YAML` file [`research/llama3_1/predict_llama3_1_8b.yaml`](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3_1/llama3_1_8b/predict_llama3_1_8b.yaml) of the Llama3.1-8B model inference in the `research` directory, the configuration item `auto_register` is added for automatic registration to register the customized `Llama3Tokenizer` in [`research/llama3_1/llama3_1_tokenizer.py`](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3_1/llama3_1_tokenizer.py). - -```yaml -... -processor: - return_tensors: ms - tokenizer: - model_max_length: 8192 - vocab_file: "/path/tokenizer.model" - pad_token: "<|reserved_special_token_0|>" - type: Llama3Tokenizer - auto_register: llama3_1_tokenizer.Llama3Tokenizer - type: LlamaProcessor -... -``` - -The relative import path `auto_register: llama3_1_tokenizer.Llama3Tokenizer` of `Llama3Tokenizer` is configured under `tokenizer`. - -Also, `vocab_file` under `tokenizer` should configure as the real path to the tokenizer `tokenizer.model`. 
- -Run the following command to start the inference job: - -```bash -python run_mindformer.py --config research/llama3_1/predict_llama3_1_8b.yaml --load_checkpoint path/to/llama3_1_8b.ckpt --register_path research/llama3_1 --predict_data "hello" -``` - -**Parameters** - -| Parameter | Description | -|:---------------:|:----------------------------------------------------------| -| config | Path of the `YAML` file. | -| load_checkpoint | Loaded weight path. | -| register_path | Path of the directory where the external code is located. | -| predict_data | Input data for inference. | - -`register_path` is set to `research/llama3_1` (path of the directory where the external code is located). For details about how to prepare the model weight, see [Llama3.1 Description Document > Model Weight Download](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3_1/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD). - -For details about the configuration file and configurable items, see [Configuration File Descriptions](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html). When compiling a configuration file, you can refer to an existing configuration file in the library, for example, [Llama2-7B fine-tuning configuration file](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/finetune_llama2_7b.yaml). - -After all the preceding basic elements are prepared, you can refer to other documents in the MindSpore Transformers tutorial to perform model training, fine-tuning, and inference. For details about subsequent model debugging and optimization, see [Large Model Accuracy Optimization Guide](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/acc_optimize/acc_optimize.html) and [Large Model Performance Optimization Guide](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/perf_optimize/perf_optimize.html). - -### Contributing Models to the MindSpore Transformers Open Source Repository - -You can contribute models to the MindSpore Transformers open source repository for developers to research and use. For details, see [MindSpore Transformers Contribution Guidelines](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/faq/mindformers_contribution.html). - -## MindSpore Transformers Model Migration Practice - -### Migration from Llama2-7B to Llama3-8B - -Llama3-8B and Llama2-7B have the same model structure but different model parameters, tokenizers, and weights. - -#### Model Configurations - -The following compares the model configurations between Llama2-7B and Llama3-8B. - -![model_config_comparison](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/docs/mindformers/docs/source_zh_cn/usage/image/model_config_comparison.png) - -The differences are as follows: - -- The sequence length of Llama3-8B is 8192. Therefore, `seq_length` is set to `8192`. -- Llama3-8B uses GQA and the number of heads in each key-value group is 8. Therefore, `n_kv_head` is set to `8`. -- The size of the Llama3-8B vocabulary is 128,256. Therefore, `vocab_size` is set to `128256`. -- Llama3-8B expands the hidden layer size of the feed-forward network to 14,336. Therefore, `intermediate_size` is set to `14336`. -- In Llama3-8B, the special word metaindex is modified. Therefore, `bos_token_id` is set to `128000`, `eos_token_id` is set to `128001`, and `pad_token_id` is set to `128002`. -- In Llama3-8B, the value of **theta** in the rotation position code is changed to **500000**. Therefore, `theta` is set to `500000`. 
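Taken together, the differences above amount to a handful of `model_config` edits. The following fragment is only a sketch of the changed items, using the parameter names and values listed above; the exact key spelling and the surrounding configuration should be taken from the actual Llama3-8B configuration file referenced below.

```yaml
model:
  model_config:
    ...
    seq_length: 8192          # Llama3-8B sequence length
    n_kv_head: 8              # GQA: 8 key-value heads per group
    vocab_size: 128256        # larger Llama3 vocabulary
    intermediate_size: 14336  # expanded feed-forward hidden size
    bos_token_id: 128000      # special token indexes
    eos_token_id: 128001
    pad_token_id: 128002
    theta: 500000             # rotary position embedding theta
    ...
```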
- -After modifying the corresponding content in the `YAML` file of Llama2-7B, you can obtain the [Llama3-8B configuration file](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/llama3_8b/finetune_llama3_8b.yaml). - -#### Tokenizer - -Llama3-8B re-implements the tokenizer. According to the official implementation, PretrainedTokenizer is inherited from MindSpore Transformers to implement Llama3Tokenizer, which is written in [llama3_tokenizer.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/llama3_tokenizer.py). - -#### Weight Conversion - -The parameters of Llama3-8B are the same as those of Llama2-7B. Therefore, the weight conversion process of Llama2-7B can be reused. For details, see [Llama3 Document > Weight Conversion](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E8%BD%AC%E6%8D%A2). - -#### Dataset Processing - -The tokenizer of Llama3-8B is different from that of Llama2-7B. Therefore, you need to replace the tokenizer of Llama3-8B to preprocess data based on the dataset processing script of Llama2-7B. For details, see [conversation.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/llama3_conversation.py) and [llama_preprocess.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/llama3_preprocess.py). - -For details about the implementation of Llama3 in MindSpore Transformers, see [Llama3 folder](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/llama3) in the MindSpore Transformers repository. For details about how to use Llama3 in MindSpore Transformers, see [LLama3 documents](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/README.md). diff --git a/docs/mindformers/docs/source_en/usage/evaluation.md b/docs/mindformers/docs/source_en/usage/evaluation.md deleted file mode 100644 index be8f8b3b41ad3b79cb98233b69c7dbeead8b072b..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/usage/evaluation.md +++ /dev/null @@ -1,540 +0,0 @@ -# Evaluation - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/usage/evaluation.md) - -## Harness Evaluation - -### Introduction - -[LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) is an open-source language model evaluation framework that provides evaluation of more than 60 standard academic datasets, supports multiple evaluation modes such as HuggingFace model evaluation, PEFT adapter evaluation, and vLLM inference evaluation, and supports customized prompts and evaluation metrics, including the evaluation tasks of the loglikelihood, generate_until, and loglikelihood_rolling types. After MindSpore Transformers is adapted based on the Harness evaluation framework, the MindSpore Transformers model can be loaded for evaluation. 
The currently verified models and supported evaluation tasks are shown in the table below (the remaining models and evaluation tasks are being actively verified and adapted, please pay attention to version updates):

| Verified models | Supported evaluation tasks |
|-----------------|------------------------------------------------|
| Llama3 | gsm8k, ceval-valid, mmlu, cmmlu, race, lambada |
| Llama3.1 | gsm8k, ceval-valid, mmlu, cmmlu, race, lambada |
| Qwen2 | gsm8k, ceval-valid, mmlu, cmmlu, race, lambada |

### Installation

Harness supports two installation methods: pip installation and source code compilation. Pip installation is simpler and faster, while compiling from source makes debugging and analysis easier; users can choose the installation method that suits their needs.

#### pip Installation

Users can execute the following command to install Harness (version 0.4.4 is recommended):

```shell
pip install lm_eval==0.4.4
```

#### Source Code Compilation Installation

Users can execute the following commands to compile and install Harness:

```bash
git clone --depth 1 -b v0.4.4 https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .
```

### Usage

#### Preparations Before Evaluation

1. Create a new directory, for example named `model_dir`, for storing the model yaml files.
2. Place the model inference yaml configuration file (predict_xxx.yaml) in the directory created in the previous step. The directory locations of the inference yaml configuration files for different models can be found in the [model library](../start/models.md).
3. Configure the yaml file. If the model class, model Config class, or model Tokenizer class in the yaml uses external code, that is, the code files are in the [research](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research) directory or other external directories, the yaml file needs to be modified: under the `type` field of the corresponding class, add the `auto_register` field in the format `module.class` (`module` is the file name of the script where the class is located, and `class` is the class name; if the field already exists, no modification is needed).

    Using the [predict_llama3_1_8b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3_1/llama3_1_8b/predict_llama3_1_8b.yaml) configuration as an example, modify some of the configuration items as follows:

    ```yaml
    run_mode: 'predict' # Set inference mode
    load_checkpoint: 'model.ckpt' # path of ckpt
    processor:
      tokenizer:
        vocab_file: "tokenizer.model" # path of tokenizer
        type: Llama3Tokenizer
        auto_register: llama3_tokenizer.Llama3Tokenizer
    ```

    For detailed instructions on each configuration item, please refer to the [configuration description](../appendix/conf_files.md).
4. If you use the `ceval-valid`, `mmlu`, `cmmlu`, `race`, or `lambada` datasets for evaluation, you need to set `use_flash_attention` to `False`. Using `predict_llama3_1_8b.yaml` as an example, modify the yaml as follows:

    ```yaml
    model:
      model_config:
        # ...
        use_flash_attention: False # Set to False
        # ...
    ```

#### Evaluation Example

Execute the script [run_harness.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/toolkit/benchmarks/run_harness.sh) to evaluate.
- -The following table lists the parameters of the script of `run_harness.sh`: - -| Parameter | Type | Description | Required | -|---------------|------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| -| `--register_path`| str | The absolute path of the directory where the cheat code is located. For example, the model directory under the [research](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research) directory. | No(The cheat code is required) | -| `--model` | str | The value must be `mf`, indicating the MindSpore Transformers evaluation policy. | Yes | -| `--model_args` | str | Model and evaluation parameters. For details, see MindSpore Transformers model parameters. | Yes | -| `--tasks` | str | Dataset name. Multiple datasets can be specified and separated by commas (,). | Yes | -| `--batch_size` | int | Number of batch processing samples. | No | - -The following table lists the parameters of `model_args`: - -| Parameter | Type | Description | Required | -|--------------|------|--------------------------------------------------------------------------|----------| -| `pretrained` | str | Model directory. | Yes | -| `max_length` | int | Maximum length of model generation. | No | -| `use_parallel` | bool | Enable parallel strategy (It must be enabled for multi card evaluation). | No | -| `tp` | int | The number of parallel tensors. | No | -| `dp` | int | The number of parallel data. | No | - -Harness evaluation supports single-device single-card, single-device multiple-card, and multiple-device multiple-card scenarios, with sample evaluations for each scenario listed below: - -1. Single Card Evaluation Example - - ```shell - source toolkit/benchmarks/run_harness.sh \ - --register_path mindformers/research/llama3_1 \ - --model mf \ - --model_args pretrained=model_dir \ - --tasks gsm8k - ``` - -2. Multi Card Evaluation Example - - ```shell - source toolkit/benchmarks/run_harness.sh \ - --register_path mindformers/research/llama3_1 \ - --model mf \ - --model_args pretrained=model_dir,use_parallel=True,tp=4,dp=1 \ - --tasks ceval-valid \ - --batch_size BATCH_SIZE WORKER_NUM - ``` - - - `BATCH_SIZE` is the sample size for batch processing of models; - - `WORKER_NUM` is the number of compute devices. - -3. 
Multi-Device and Multi-Card Example - - Node 0 (Master) Command: - - ```shell - source toolkit/benchmarks/run_harness.sh \ - --register_path mindformers/research/llama3_1 \ - --model mf \ - --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ - --tasks lambada \ - --batch_size 2 8 4 192.168.0.0 8118 0 output/msrun_log False 300 - ``` - - Node 1 (Secondary Node) Command: - - ```shell - source toolkit/benchmarks/run_harness.sh \ - --register_path mindformers/research/llama3_1 \ - --model mf \ - --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ - --tasks lambada \ - --batch_size 2 8 4 192.168.0.0 8118 1 output/msrun_log False 300 - ``` - - Node n (Nth Node) Command: - - ```shell - source toolkit/benchmarks/run_harness.sh \ - --register_path mindformers/research/llama3_1 \ - --model mf \ - --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ - --tasks lambada \ - --batch_size BATCH_SIZE WORKER_NUM LOCAL_WORKER MASTER_ADDR MASTER_PORT NODE_RANK output/msrun_log False CLUSTER_TIME_OUT - ``` - - - `BATCH_SIZE` is the sample size for batch processing of models; - - `WORKER_NUM` is the total number of compute devices used on all nodes; - - `LOCAL_WORKER` is the number of compute devices used on the current node; - - `MASTER_ADDR` is the ip address of the primary node to be started in distributed mode; - - `MASTER_PORT` is the Port number bound for distributed startup; - - `NODE_RANK` is the Rank ID of the current node; - - `CLUSTER_TIME_OUT`is the waiting time for distributed startup, in seconds. - - To execute the multi-node multi-device script for evaluating, you need to run the script on different nodes and set MASTER_ADDR to the IP address of the primary node. The IP address should be the same across all nodes, and only the NODE_RANK parameter varies across nodes. - -### Viewing the Evaluation Results - -After executing the evaluation command, the evaluation results will be printed out on the terminal. Taking gsm8k as an example, the evaluation results are as follows, where Filter corresponds to the way the matching model outputs results, n-shot corresponds to content format of dataset, Metric corresponds to the evaluation metric, Value corresponds to the evaluation score, and Stderr corresponds to the score error. - -| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr | -|-------|--------:|------------------|-------:|-------------|---|--------|---|--------| -| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.5034 | ± | 0.0138 | -| | | strict-match | 5 | exact_match | ↑ | 0.5011 | ± | 0.0138 | - -## VLMEvalKit Evaluation - -### Overview - -[VLMEvalKit](https://github.com/open-compass/VLMEvalKit) -is an open source toolkit designed for large visual language model evaluation, supporting one-click evaluation of large visual language models on various benchmarks, without the need for complicated data preparation, making the evaluation process easier. It supports a variety of graphic multimodal evaluation sets and video multimodal evaluation sets, a variety of API models and open source models based on PyTorch and HF, and customized prompts and evaluation metrics. After adapting MindSpore Transformers based on VLMEvalKit evaluation framework, it supports loading multimodal large models in MindSpore Transformers for evaluation. 
- -The currently adapted models and supported evaluation datasets are shown in the table below (the remaining models and evaluation datasets are actively being adapted, please pay attention to version updates): - -| Adapted models | Supported evaluation datasets | -|--|---------------------------------------------------| -| cogvlm2-image-llama3-chat | MME, MMBench, COCO Caption, MMMU_DEV_VAL, TextVQA_VAL | -| cogvlm2-video-llama3-chat | MMBench-Video, MVBench | - -### Supported Feature Descriptions - -1. Supports automatic download of evaluation datasets; -2. Generate results with one click. - -### Installation - -#### Downloading the Code and Compiling, Installing Dependency Packages - -1. Download and modify the code: Due to known issues with open source frameworks running MVBench datasets, it is necessary to modify the code by importing patch. Get [eval.patch](https://github.com/user-attachments/files/17956417/eval.patch) and download and place it in the local directory. When importing the patch, use the absolute path of the patch. - - Execute the following command: - - ```bash - git clone https://github.com/open-compass/VLMEvalKit.git - cd VLMEvalKit - git checkout 78a8cef3f02f85734d88d534390ef93ecc4b8bed - git apply /path/to/eval.patch - ``` - -2. Install dependency packages - - Find the requirements.txt (VLMEvalKit/requirements.txt) file in the downloaded code and modify it to the following content: - - ```txt - gradio==4.40.0 - huggingface_hub==0.24.2 - imageio==2.35.1 - matplotlib==3.9.1 - moviepy==1.0.3 - numpy==1.26.4 - omegaconf==2.3.0 - openai==1.3.5 - opencv-python==4.10.0.84 - openpyxl==3.1.5 - pandas==2.2.2 - peft==0.12.0 - pillow==10.4.0 - portalocker==2.10.1 - protobuf==5.27.2 - python-dotenv==1.0.1 - requests==2.32.3 - rich==13.7.1 - sentencepiece==0.2.0 - setuptools==69.5.1 - sty==1.0.6 - tabulate==0.9.0 - tiktoken==0.7.0 - timeout-decorator==0.5.0 - torch==2.5.1 - tqdm==4.66.4 - transformers==4.43.3 - typing_extensions==4.12.2 - validators==0.33.0 - xlsxwriter==3.2.0 - torchvision==0.20.1 - ``` - - Execute Command: - - ```bash - pip install -r requirements.txt - ``` - -#### Installing FFmpeg - -For Ubuntu systems follow the steps below to install: - -1. Update the system package list and install the system dependency libraries required for compiling FFmpeg and decode. - - ```bash - apt-get update - apt-get -y install autoconf automake build-essential libass-dev libfreetype6-dev libsdl2-dev libtheora-dev libtool libva-dev libvdpau-dev libvorbis-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev pkg-config texinfo zlib1g-dev yasm libx264-dev libfdk-aac-dev libmp3lame-dev libopus-dev libvpx-dev - ``` - -2. Download the compressed source code package of FFmpeg4.1.11 from the FFmpeg official website, unzip the source code package and enter the decompressed directory; Configure compilation options for FFmpeg: specify the installation path (absolute path) of FFmpeg, generate shared libraries, enable support for specific codecs, and enable no free and GPL licensed features; Compile and install FFmpeg. - - ```bash - wget --no-check-certificate https://www.ffmpeg.org/releases/ffmpeg-4.1.11.tar.gz - tar -zxvf ffmpeg-4.1.11.tar.gz - cd ffmpeg-4.1.11 - ./configure --prefix=/{path}/ffmpeg-xxx --enable-shared --enable-libx264 --enable-libfdk-aac --enable-libmp3lame --enable-libopus --enable-libvpx --enable-nonfree --enable-gpl - make && make install - ``` - -Install OpenEuler system according to the following steps: - -1. 
Download the compressed source code package of FFmpeg4.1.11 from the FFmpeg official website, unzip the source code package and enter the decompressed directory; Configure compilation options for FFmpeg: specify the installation path (absolute path) for FFmpeg; Compile and install FFmpeg. - - ```bash - wget --no-check-certificate https://www.ffmpeg.org/releases/ffmpeg-4.1.11.tar.gz - tar -zxvf ffmpeg-4.1.11.tar.gz - cd ffmpeg-4.1.11 - ./configure --enable-shared --disable-x86asm --prefix=/path/to/ffmpeg - make && make install - ``` - -2. Configure environment variables, `FFMPEG-PATH` requires specifying the absolute path for installing FFmpeg so that the system can correctly locate and use FFmpeg and its related libraries. - - ```bash - vi ~/.bashrc - export FFMPEG_PATH=/path/to/ffmpeg/ - export LD_LIBRARY_PATH=$FFMPEG_PATH/lib:$LD_LIBRARY_PATH - source ~/.bashrc - ``` - -#### Installing Decord - -Install Ubuntu system according to the following steps: - -1. Pull the Decord code, enter the Decord directory, initialize and update Decord dependencies, and execute the following command: - - ```bash - git clone https://github.com/dmlc/decord.git - cd decord - ``` - -2. Create and enter the `build` directory, configure the compilation options for Decord, disable CUDA support, enable Release mode (optimize performance), specify the installation path for FFmpeg, and compile the Decord library. Copy the compiled libdecord.so library file to the system library directory and to the `python` directory of `decord`. - - ```bash - mkdir build - cd build - cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release -DFFMPEG_DIR=/{path}/ffmpeg-4.1.11 && make - cp libdecord.so /usr/local/lib/ - cp libdecord.so ../python/decord/libdecord.so - ``` - -3. Go to the python folder in the `decord` directory, install the numpy dependency, and install the python package for Decord. Add the library path (absolute path) of FFmpeg to the environment variable `LD_LIBRARY_PATH` to ensure that the runtime can find the shared library of FFmpeg. - - ```bash - cd /path/to/decord/python - pip install numpy - python setup.py install - export LD_LIBRARY_PATH=/path/to/ffmpeg-4.1.11/lib/:$LD_LIBRARY_PATH - ``` - -4. Execute Python commands to test if the Decord installation is successful. If there are no errors, it means the installation is successful. - - ```bash - python -c "import decord; from decord import VideoReader" - ``` - -For OpenEuler systems follow the steps below to install: - -1. Pull the Decord code and enter the `decord` directory. - - ```bash - git clone --recursive https://github.com/dmlc/decord - cd decord - ``` - -2. Create and enter the build directory, configure the compilation options for Decord, specify the installation path (absolute path) for ffmpeg, and compile the `decord` library; Enter the `python` folder in the `decord` directory, configure environment variables, and specify `PYTHONPATH`; Install the python package for Decord. - - ```bash - mkdir build && cd build - cmake -DFFMPEG_DIR=/path/ffmpeg-4.1.11 .. - make - cd ../python - pwd=$PWD - echo "PYTHONPATH=$PYTHONPATH:$pwd" >> ~/.bashrc - source ~/.bashrc - python3 setup.py install - ``` - -3. Execute python commands to test if the Decord installation is successful. If there are no errors, it means the installation is successful. - - ```bash - python -c "import decord; from decord import VideoReader" - ``` - -### Evaluation - -#### Preparations Before Evaluation - -1. Create a new directory, for example named `model_dir`, to store the model yaml file; -2. 
Place the model inference yaml configuration file (predict_xxx_. yaml) in the directory created in the previous step. For details, Please refer to the inference content of description documents for each model in the [model library](../start/models.md); -3. Configure the yaml file. - - Using [predict_cogvlm2_image_llama3_chat_19b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/predict_cogvlm2_image_llama3_chat_19b.yaml) configuration as an example: - - ```yaml - load_checkpoint: "/{path}/model.ckpt" # Specify the path to the weights file - model: - model_config: - use_past: True # Turn on incremental inference - is_dynamic: False # Turn off dynamic shape - - tokenizer: - vocab_file: "/{path}/tokenizer.model" # Specify the tokenizer file path - ``` - - Configure the yaml file. Refer to [configuration description](../appendix/conf_files.md). -4. The MMBench-Video dataset evaluation requires the use of the GPT-4 Turbo model for evaluation and scoring. Please prepare the corresponding API Key in advance and put it in the VLMEvalKit/.env file as follows: - - ```text - OPENAI_API_KEY=your_apikey - ``` - -5. At the beginning of MVBench dataset evaluation, if you are prompted to enter the HuggingFace key, please follow the prompts to ensure the normal execution of subsequent evaluation. - -#### Pulling Up the Evaluation Task - -Execute the script in the root directory of the MindSpore Transformers local code repository: [run_vlmevalkit.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/toolkit/benchmarks/run_vlmevalkit.sh). - -Execute the following command to initiate the evaluation task: - -```shell -#!/bin/bash - -source toolkit/benchmarks/run_vlmevalkit.sh \ - --data MMMU_DEV_VAL \ - --model cogvlm2-image-llama3-chat \ - --verbose \ - --work_dir /path/to/cogvlm2-image-eval-result \ - --model_path model_dir -``` - -### Evaluation Parameters - -| Parameters | Type | Descriptions | Compulsory(Y/N)| -|-----------------|-----|--------------------------------------------------------------------------------------------------------------------------------------------|------| -| `--data` | str | Name of the dataset, multiple datasets can be passed in, split by spaces. | Y | -| `--model` | str | Name of the model. | Y | -| `--verbose` | / | Outputs logs from the evaluation run. | N | -| `--work_dir` | str | Directory for storing evaluation results. By default, evaluation results are stored in the `outputs` folder of the current execution directory by default. | N | -| `--model_path` | str | The folder path containing the model configuration file. | Y | -| `--register_path`| str | The absolute path of the directory where the cheat code is located. For example, the model directory under the [research](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research) directory. | No(The cheat code is required) | - -If the server does not support online downloading of image datasets due to network limitations, you can upload the downloaded .tsv dataset file to the ~/LMUData directory on the server for offline evaluation. (For example: ~/LMUData/MME.tsv or ~/LMUData/MMBench_DEV_EN.tsv or ~/LMUData/COCO_VAL.tsv) - -### Viewing Review Results - -After evaluating in the above way, find the file ending in .json or .csv in the directory where the evaluation results are stored to view the evaluation results. 
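Because the result files may be nested in per-model subdirectories under the work directory, a small sketch like the following (the path is illustrative and should match the value passed to `--work_dir`) can be used to locate and print every `.json` result file:

```python
# Walk the VLMEvalKit work_dir and print every JSON result file found there.
import json
from pathlib import Path

work_dir = Path("/path/to/cogvlm2-image-eval-result")  # same value as --work_dir
for result_file in sorted(work_dir.rglob("*.json")):
    print(result_file)
    print(json.loads(result_file.read_text(encoding="utf-8")))
```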
- -The results of the evaluation examples are as follows, where `Bleu` and `ROUGE_L` denote the metrics for evaluating the quality of the translation, and `CIDEr` denotes the metrics for evaluating the image description task. - -```json -{ - "Bleu": [ - 15.523950970070652, - 8.971141548228058, - 4.702477458554666, - 2.486860744700995 - ], - "ROUGE_L": 15.575063213115946, - "CIDEr": 0.01734615519604295 -} -``` - -## Using the VideoBench Dataset for Model Evaluation - -### Overview - -[Video-Bench](https://github.com/PKU-YuanGroup/Video-Bench/tree/main) is the first comprehensive evaluation benchmark for Video-LLMs, featuring a three-level ability assessment that systematically evaluates models in video-exclusive understanding, prior knowledge incorporation, and video-based decision-making abilities. - -### Preparations Before Evaluation - -1. Download Dataset - - Download [Videos of Video-Bench](https://huggingface.co/datasets/LanguageBind/Video-Bench), place it in the following directory format after decompression: - - ```text - egs/VideoBench/ - └── Eval_video - ├── ActivityNet - │ ├── v__2txWbQfJrY.mp4 - │ ... - ├── Driving-decision-making - │ ├── 1.mp4 - │ ... - ... - ``` - -2. Download Json - - Download [Jsons of Video-Bench](https://github.com/PKU-YuanGroup/Video-Bench/tree/main?tab=readme-ov-file), place it in the following directory format after decompression: - - ```text - egs/Video-Bench/ - └── Eval_QA - ├── Youcook2_QA_new.json and other json files - ... - ``` - -3. Download the correct answers to all questions - - Download [Answers of Video-Bench](https://huggingface.co/spaces/LanguageBind/Video-Bench/resolve/main/file/ANSWER.json). - -> Notes: The text data in Video-Bench is stored in the path format of 'egs/VideoBench/Eval-QA'(The directory should have at least two layers, and the last layer should be `EvalQA`); The video data in Video-Bench is stored in the path format of "egs/VideoBench/Eval_video"(The directory should have at least two layers, and the last layer should be `Eval_video`). - -### Evaluation - -The execution script path can refer to the link: [eval_with_videobench.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/toolkit/benchmarks/eval_with_videobench.py). - -#### Executing Inference Script to Obtain Inference Results - -```shell -python toolkit/benchmarks/eval_with_videobench.py \ ---model_path model_path \ ---dataset_name dataset_name \ ---Eval_QA_root Eval_QA_root \ ---Eval_Video_root Eval_Video_root \ - --chat_conversation_output_folder output -``` - -> The parameter `Eval_QA_root` path is filled in the previous directory of Eval-QA; The parameter `Eval_Video_root` path is filled in the previous directory of Eval_video. - -**Parameters Description** - -| **Parameters** | **Compulsory(Y/N)** | **Description** | -|-------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------------| -| `--model_path` | Y | The folder path for storing model related files, including model configuration files and model vocabulary files. | -| `--dataset_name` | N | Evaluation datasets name, default to None, evaluates all subsets of VideoBench. | -| `--Eval_QA_root` | Y | Directory for storing JSON files of VideoBench dataset. | -| `--Eval_Video_root` | Y | The video file directory for storing the VideoBench dataset. | -| `--chat_conversation_output_folder` | N | Directory for generating result files. 
By default, it is stored in the Chat_desults folder of the current directory. | - -After running, a dialogue result file will be generated in the chat_conversation_output_folder directory. - -#### Evaluating and Scoring Based on the Generated Results - -Video-Bench can evaluate the answers generated by the model using ChatGPT or T5, and ultimately obtain the final scores for 13 subsets of data. - -For example, using ChatGPT for evaluation and scoring: - -```shell -python Step2_chatgpt_judge.py \ ---model_chat_files_folder ./Chat_results \ ---apikey sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx \ ---chatgpt_judge_output_folder ./ChatGPT_Judge - -python Step3_merge_into_one_json.py \ ---chatgpt_judge_files_folder ./ChatGPT_Judge \ ---merge_file ./Video_Bench_Input.json -``` - -The script path in the above evaluation scoring command is: [Step2_chatgpt_judge.py](https://github.com/PKU-YuanGroup/Video-Bench/blob/main/Step2_chatgpt_judge.py), or [Step3_merge_into_one_json.py](https://github.com/PKU-YuanGroup/Video-Bench/blob/main/Step3_merge_into_one_json.py). - -Since ChatGPT may answer some formatting errors, you need to run below Step2_chatgpt_judge.py multiple times to ensure that each question is validated by chatgpt. - -## FAQ - -1. Use Harness or VLMEvalKit for evaluation, when loading the HuggingFace datasets, report `SSLError`: - - Refer to [SSL Error reporting solution](https://stackoverflow.com/questions/71692354/facing-ssl-error-with-huggingface-pretrained-models). - - Note: Turning off SSL verification is risky and may be exposed to MITM. It is only recommended to use it in the test environment or in the connection you fully trust. - -2. An `AssertionError` occurs when MVBench dataset is used in VLMEvalKit for evaluation: - - Because the open source framework `VLMEvalKit` has known problems when running `MVBench` dataset. Modify the file by referring to the [issue](https://github.com/open-compass/VLMEvalKit/issues/888) of the open-source framework, or delete the files generated during the evaluation and run the command again (specified by the `--work_dir` parameter, in the `outputs` folder of the current execution directory by default). \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/usage/inference.md b/docs/mindformers/docs/source_en/usage/inference.md deleted file mode 100644 index f2a3aaac6d20fe0acf8080561e0e3823151962e7..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/usage/inference.md +++ /dev/null @@ -1,361 +0,0 @@ -# Inference - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/usage/inference.md) - -## Overview - -MindSpore Transformers provides the foundation model inference capability. Users can run the unified script `run_mindformer` or write a script to call the high-level API to start inference. If the unified script `run_mindformer` is used, you can directly start the system through the configuration file without writing code. - -## Basic Process - -The inference process can be categorized into the following steps: - -### 1. Models of Selective Inference - -Depending on the required inference task, different models are chosen, e.g. for text generation one can choose Llama2, etc. - -### 2. 
Preparing Model Weights - -Model weights can be categorized into two types: complete weights and distributed weights, and the following instructions should be referred to when using them. - -#### 2.1 Complete Weights - -Complete weights can be obtained in two ways: - -1. After downloading the open source weights of the corresponding model from the HuggingFace model library, refer to [Weight Format Conversion](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/weight_conversion.html) to convert them to the ckpt format. -2. Pre-trained or fine-tuned distributed weights are used to generate a complete weight by [merging](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/transform_weight.html). - -#### 2.2 Distributed Weights - -Distributed weights are typically obtained by pre-training or after fine-tuning and are stored by default in the `./output/checkpoint_network` directory, which needs to be converted to single-card or multi-card weights before performing single-card or multi-card inference. - -If the inference uses a weight slicing that is different from the model slicing provided in the inference task, such as in these cases below, the weights need to be additionally converted to a slice that matches the slicing of the model in the actual inference task. - -1. The weights obtained from multi-card training are reasoned on a single card; -2. The weights of the eight-card training are reasoned over two cards; -3. Already sliced distributed weights are reasoned on a single card, and so on. - -The command samples in the following contents are all used in the way of online autoslicing. It is recommended to use online autoslicing by setting the command parameters `--auto_trans_ckpt` to `-True` and `-src_strategy_path_or_dir` to the weighted slicing strategy file or directory path (which is saved by default after training under `./output/strategy`) are automatically sliced in the inference task. Details can be found in [Distributed Weight Slicing and Merging](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/transform_weight.html). - -> Since both the training and inference tasks use `./output` as the default output path, when using the strategy file output by the training task as the source weight strategy file for the inference task, you need to move the strategy file directory under the default output path to another location to avoid it being emptied by the process of the inference task, for example: -> -> ```mv ./output/strategy/ ./strategy``` - -### 3. Executing Inference Tasks - -Call the high-level API or use the unified script `run_mindformer` to execute inference tasks. - -## Inference Based on the run_mindformer Script - -For single-device inference, you can directly run [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/run_mindformer.py). For multi-device inference, you need to run [scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/scripts/msrun_launcher.sh). - -The arguments to run_mindformer.py are described below: - -| Parameters | Parameter Descriptions | -|:-------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------| -| config | Path to the yaml configuration file | -| run_mode | The running mode, with inference set to predict | -| use_parallel | Whether to use multicard inference | -| load_checkpoint | the loaded weight path | -| predict_data | Input data for inference. 
Multi-batch inference needs to pass the path to the txt file of the input data, which contains multiple lines of inputs. | -| auto_trans_ckpt | Automatic weight slicing. Default value is False | -| src_strategy_path_or_dir | Path to the strategy file for weights | -| predict_batch_size | batch_size for multi-batch inference | -| modal_type | Given modal type corresponds to predict data in multimodal inference scenario. | - -msrun_launcher.sh includes the run_mindformer.py command and the number of inference cards as two parameters. - -The following will describe the usage of single and multi-card inference using Llama2 as an example, with the recommended configuration of the [predict_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/predict_llama2_7b.yaml) file. - -> During inference, the vocabulary file `tokenizer.model` required for the Llama2 model will be automatically downloaded (ensuring smooth network connectivity). If the file exists locally, you can place it in the `./checkpoint_download/Llama2/` directory in advance. - -### Single-Device Inference - -When using complete weight inference, the following command is executed to start the inference task: - -```shell -python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---use_parallel False \ ---load_checkpoint path/to/checkpoint.ckpt \ ---predict_data 'I love Beijing, because' -``` - -If you use distributed weight files for inference, you need to add the `--auto_trans_ckpt` and `-src_strategy_path_or_dir` entries, with the following startup commands: - -```shell -python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---use_parallel False \ ---auto_trans_ckpt True \ ---src_strategy_path_or_dir ./strategy \ ---load_checkpoint path/to/checkpoint.ckpt \ ---predict_data 'I love Beijing, because' -``` - -The following result appears, proving that the inference was successful. The inference result is also saved to the `text_generation_result.txt` file in the current directory. The detailed log can be viewed in the `./output/msrun_log` directory. - -```text -'text_generation_text': [I love Beijing, because it is a city that is constantly constantly changing. I have been living here for ......] -``` - -### Multi-Card Inference - -The configuration requirements for multi-card inference differ from those of single card, and you need to refer to the following instructions to modify the [predict_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/predict_llama2_7b.yaml) configuration. - -1. The configuration of model_parallel and the number of cards used need to be consistent. The following use case is 2-card inference, and model_parallel needs to be set to 2; -2. The current version of multi-card inference does not support data parallelism, you need to set data_parallel to 1. - -**Configuration before modification:** - -```yaml -parallel_config: - data_parallel: 8 - model_parallel: 1 - pipeline_stage: 1 -``` - -**Configuration after modifications:** - -```yaml -parallel_config: - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 1 -``` - -When full weight inference is used, you need to enable the online slicing mode to load weights. 
For details, see the following command: - -```shell -bash scripts/msrun_launcher.sh "python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---use_parallel True \ ---auto_trans_ckpt True \ ---load_checkpoint path/to/checkpoint.ckpt \ ---predict_data 'I love Beijing, because'" \ -2 -``` - -Refer to the following commands when distributed weight inference is used and the slicing strategy for the weights is the same as the slicing strategy for the model: - -```shell -bash scripts/msrun_launcher.sh "python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---use_parallel True \ ---load_checkpoint path/to/checkpoint_dir \ ---predict_data 'I love Beijing, because'" \ -2 -``` - -When distributed weight inference is used and the slicing strategy of the weights is not consistent with the slicing strategy of the model, you need to enable the online slicing function to load the weights. Refer to the following command: - -```shell -bash scripts/msrun_launcher.sh "python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---use_parallel True \ ---auto_trans_ckpt True \ ---src_strategy_path_or_dir ./strategy \ ---load_checkpoint path/to/checkpoint_dir \ ---predict_data 'I love Beijing, because'" \ -2 -``` - -Inference results are viewed in the same way as single-card inference. - -### Multi-Device Multi-Batch Inference - -Multi-card multi-batch inference is initiated in the same way as [multi-card inference](#multi-card-inference), but requires the addition of the `predict_batch_size` inputs and the modification of the `predict_data` inputs. - -The content and format of the `input_predict_data.txt` file is an input each line, and the number of questions is the same as the `predict_batch_size`, which can be found in the following format: - -```txt -I love Beijing, because -I love Beijing, because -I love Beijing, because -I love Beijing, because -``` - -Refer to the following commands to perform inference tasks, taking the full weight inference as an example: - -```shell -bash scripts/msrun_launcher.sh "python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---predict_batch_size 4 \ ---use_parallel True \ ---auto_trans_ckpt True \ ---load_checkpoint path/to/checkpoint.ckpt \ ---predict_data path/to/input_predict_data.txt" \ -2 -``` - -Inference results are viewed in the same way as single-card inference. - -### Multimodal Inference - -Use `cogvlm2-llama3-chat-19B` model as example and see the following process with details: - -Modify configuration yaml file [predict_cogvlm2_image_llama3_chat_19b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/predict_cogvlm2_image_llama3_chat_19b.yaml). - -```shell -model: - model_config: - use_past: True # Turn on incremental inference - is_dynamic: False # Turn off dynamic shape - - tokenizer: - vocab_file: "/{path}/tokenizer.model" # Specify the tokenizer file path -``` - -Run inference scripts. - -```shell -python run_mindformer.py \ - --config configs/cogvlm2/predict_cogvlm2_image_llama3_chat_19b.yaml \ - --run_mode predict \ - --predict_data "/path/image.jpg" "Please describe this image." \ # input data,first input is image path,second input is text path. - --modal_type image text \ # modal type for input data, 'image' type for image path, 'text' type for text path. 
- --load_checkpoint /{path}/cogvlm2-image-llama3-chat.ckpt -``` - -## Inference Based on High-level Interface - -> For security reasons, it is not recommended to use high-level interfaces for inference. This chapter will be deprecated in the next version. If you have any questions or suggestions, please submit feedback through [Community Issue](https://gitee.com/mindspore/mindformers/issues/new). Thank you for your understanding and support! - -MindSpore Transformers not only provides a unified script for `run_mindformer` inference, but also supports user-defined calls to high-level interfaces such as `pipeline` or `chat` for implementation. - -### Pipeline Interface - -Customized text generation inference task flow based on `pipeline` interface, supporting single card inference and multi-card inference. About how to use `pipeline` interface to start the task and output the result, you can refer to the following implementation. The specific parameter description can be viewed [pipeline interface API documentation](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/mindformers/mindformers.pipeline.html#mindformers.pipeline). - -#### Incremental Inference - -```python -from mindformers import build_context -from mindformers import AutoModel, AutoTokenizer, pipeline, TextStreamer - -# Construct the input content. -inputs = ["I love Beijing, because", "LLaMA is a", "Huawei is a company that"] - -# Initialize the environment. -build_context({'context': {'mode': 0}, 'run_mode': 'predict', 'parallel': {}, 'parallel_config': {}}) - -# Instantiate a tokenizer. -tokenizer = AutoTokenizer.from_pretrained('llama2_7b') - -# Instantiate a model. -# Modify the path to the local weight path. -model = AutoModel.from_pretrained('llama2_7b', checkpoint_name_or_path="path/to/llama2_7b.ckpt", use_past=True) -# Model instantiation is also supported from modelers.cn.Given repo id which format is MindSpore-Lab/model_name -# model = AutoModel.from_pretrained('MindSpore-Lab/qwen1_5_7b-chat') - -# Start a non-stream inference task in the pipeline. -text_generation_pipeline = pipeline(task="text_generation", model=model, tokenizer=tokenizer) -outputs = text_generation_pipeline(inputs, max_length=512, do_sample=False, top_k=3, top_p=1) -for output in outputs: - print(output) -``` - -Save the example to `pipeline_inference.py`, modify the path for loading the weight, and run the `pipeline_inference.py` script. - -```shell -python pipeline_inference.py -``` - -The inference result is as follows: - -```text -'text_generation_text': [I love Beijing, because it is a city that is constantly constantly changing. I have been living here for ......] -'text_generation_text': [LLaMA is a large-scale, open-source, multimodal, multilingual, multitask, and multimodal pretrained language model. It is ......] -'text_generation_text': [Huawei is a company that has been around for a long time. ......] -``` - -#### Stream Inference - -```python -from mindformers import build_context -from mindformers import AutoModel, AutoTokenizer, pipeline, TextStreamer - -# Construct the input content. -inputs = ["I love Beijing, because", "LLaMA is a", "Huawei is a company that"] - -# Initialize the environment. -build_context({'context': {'mode': 0}, 'run_mode': 'predict', 'parallel': {}, 'parallel_config': {}}) - -# Instantiate a tokenizer. -tokenizer = AutoTokenizer.from_pretrained('llama2_7b') - -# Instantiate a model. -# Modify the path to the local weight path. 
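# As in the incremental inference example above, use_past=True enables
# incremental (KV cache) decoding; replace checkpoint_name_or_path with a
# local llama2_7b weight file before running.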
-model = AutoModel.from_pretrained('llama2_7b', checkpoint_name_or_path="path/to/llama2_7b.ckpt", use_past=True) -# Model instantiation is also supported from modelers.cn.Given repo id which format is MindSpore-Lab/model_name -# model = AutoModel.from_pretrained('MindSpore-Lab/qwen1_5_7b-chat') - -# Start a stream inference task in the pipeline. -streamer = TextStreamer(tokenizer) -text_generation_pipeline = pipeline(task="text_generation", model=model, tokenizer=tokenizer, streamer=streamer) -_ = text_generation_pipeline(inputs, max_length=512, do_sample=False, top_k=3, top_p=1) -``` - -Save the example to `pipeline_inference.py`, modify the path for loading the weight, and run the `pipeline_inference.py` script. - -```shell -python pipeline_inference.py -``` - -The inference result is as follows: - -```text -'text_generation_text': [I love Beijing, because it is a city that is constantly constantly changing. I have been living here for ......] -'text_generation_text': [LLaMA is a large-scale, open-source, multimodal, multilingual, multitask, and multimodal pretrained language model. It is ......] -'text_generation_text': [Huawei is a company that has been around for a long time. ......] -``` - -### chat Interface - -Based on the `chat` interface, the process of generating dialogue text inference tasks involves adding chat templates through the provided tokenizer to infer user queries. You can refer to the following implementation methods, and specific parameter descriptions can be viewed [chat interface API documentation](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/generation/mindformers.generation.GenerationMixin.html#mindformers.generation.GenerationMixin.chat). - -```python -from mindformers import build_context -from mindformers import AutoModel, AutoTokenizer - -# Construct the input content. -query = "Hello!" - -# Initialize the environment. -build_context({'context': {'mode': 0}, 'run_mode': 'predict', 'parallel': {}, 'parallel_config': {}}) - -# Instantiate a tokenizer. -tokenizer = AutoTokenizer.from_pretrained('llama2_7b') - -# Instantiate a model. -# Modify the path to the local weight path. -model = AutoModel.from_pretrained('llama2_7b', checkpoint_name_or_path="path/to/llama2_7b.ckpt", use_past=True) -# Model instantiation is also supported from modelers.cn.Given repo id which format is MindSpore-Lab/model_name -# model = AutoModel.from_pretrained('MindSpore-Lab/qwen1_5_7b-chat') - -# Start a stream inference task with chat. -response, history = model.chat(tokenizer=tokenizer, query=query, max_length=32) -print(response) -``` - -Save the example to `chat_inference.py`, modify the path for loading the weight, and run the `chat_inference.py` script. - -```shell -python chat_inference.py -``` - -The inference result is as follows: - -```text -Thanks, sir. -``` - -## More Information - -For more inference examples of different models, see [the models supported by MindSpore Transformers](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/start/models.html). 
diff --git a/docs/mindformers/docs/source_en/usage/mindie_deployment.md b/docs/mindformers/docs/source_en/usage/mindie_deployment.md deleted file mode 100644 index ed7819cf927a61343a3f75321b8ae03c27a218e5..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/usage/mindie_deployment.md +++ /dev/null @@ -1,349 +0,0 @@ -# Service Deployment - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/usage/mindie_deployment.md) - -## Introduction - -MindIE, full name Mind Inference Engine, is a high-performance inference framework based on Ascend hardware. For more information, please refer to [Official Document](https://www.hiascend.com/software/mindie). - -MindSpore Transformers are hosted in the model application layer MindIE LLM, and large models in MindSpore Transformers can be deployed through MindIE Service. - -The model support for MindIE inference can be found in [model repository](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/start/models.html). - -## Environment Setup - -### Software Installation - -1. Install MindSpore Transformers - - Refer to [MindSpore Transformers Official Installation Guide](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/quick_start/install.html) for installation. - -2. Install MindIE - - Refer to [MindIE Installation Dependencies Documentation](https://www.hiascend.com/document/detail/zh/mindie/100/envdeployment/instg/mindie_instg_0010.html) to complete the dependency installation. After that, go to [MindIE Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) to download the package and install it. - - MindIE and CANN versions must be matched, version matching relationship is as follows. - - | MindIE | CANN-toolkit | CANN-kernels | - |:-------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------:| - | [1.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | [8.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | [8.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | - -### Environment Variables - -If the installation path is the default path, you can run the following command to initialize the environment variables of each component. - -```bash -# Ascend -source /usr/local/Ascend/ascend-toolkit/set_env.sh -# MindIE -source /usr/local/Ascend/mindie/latest/mindie-llm/set_env.sh -source /usr/local/Ascend/mindie/latest/mindie-service/set_env.sh -# MindSpore -export LCAL_IF_PORT=8129 -# Networking Configuration -export MS_SCHED_HOST=127.0.0.1 # scheduler node ip address -export MS_SCHED_PORT=8090 # Scheduler node service port -``` - -> If there are other cards on the machine that have MindIE enabled, you need to be aware of any conflicts with the `MS_SCHED_PORT` parameter. If you get an error on this parameter in the log printout, try again with a different port number. 
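As a quick way to rule out such a conflict, the following standalone sketch (plain Python, not part of MindIE or MindSpore Transformers) checks whether the port you intend to export as `MS_SCHED_PORT` is already occupied on the local machine:

```python
# Check whether the intended MS_SCHED_PORT is already in use on this host.
import socket

port = 8090  # the value you plan to export as MS_SCHED_PORT
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
    in_use = sock.connect_ex(("127.0.0.1", port)) == 0
print(f"Port {port} is {'already in use' if in_use else 'free'}")
```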
- -## Basic Process of Inference Service Deployment - -### Preparing Model Files - -Create a folder for the specified model related files in the MindIE backend, such as model tokenizer files, yaml configuration files and config files. - -```bash -mkdir -p mf_model/qwen1_5_72b -``` - -Taking Qwen1.5-72B as an example, the folder directory structure is as follows: - -```reStructuredText -mf_model - └── qwen1_5_72b - ├── config.json # Model json configuration file, corresponding model download on Hugging Face - ├── vocab.json # Model vocab file, corresponding model download on Hugging Face - ├── merges.txt # Model merges file, corresponding model download on Hugging Face - ├── predict_qwen1_5_72b.yaml # Model yaml configuration file - ├── qwen1_5_tokenizer.py # Model tokenizer file, copy the corresponding model from the search directory in the mindformers repository - └── qwen1_5_72b_ckpt_dir # Model distributed weight folder -``` - -predict_qwen1_5_72b.yaml needs to be concerned with the following configuration: - -```yaml -load_checkpoint: '/mf_model/qwen1_5_72b/qwen1_5_72b_ckpt_dir' # Path to the folder that holds the model distributed weight -use_parallel: True -auto_trans_ckpt: False # Whether to enable automatic weight conversion, with offline splitting set to False -parallel_config: - data_parallel: 1 - model_parallel: 4 # Multi-card inference configures the model splitting, which generally corresponds to the number of cards used - pipeline_parallel: 1 -processor: - tokenizer: - vocab_file: "/path/to/mf_model/qwen1_5_72b/vocab.json" # vocab file absolute path - merges_file: "/path/to/mf_model/qwen1_5_72b/merges.txt" # merges file absolute path -``` - -For model weight downloading and conversions, refer to the [Weight Format Conversion Guide](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/weight_conversion.html). - -Required files and configurations may vary from model to model. Refer to the model-specific inference sections in [Model Repository](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/start/models.html) for details. - -### Starting MindIE - -#### 1. One-click Start (Recommended) - -The mindformers repository provides a one-click pull-up MindIE script with preconfigured environment variable settings and servitization configurations, which allows you to quickly pull up the service by simply entering the directory of the model file. - -Go to the `scripts` directory and execute the MindIE startup script: - -```shell -cd ./scripts -bash run_mindie.sh --model-name xxx --model-path /path/to/model - -# Parameter descriptions ---model-name: Mandatory, set MindIE backend name ---model-path: Mandatory, set model folder path, such as /path/to/mf_model/qwen1_5_72b ---help : Instructions for using the script -``` - -View logs: - -```bash -tail -f output.log -``` - -When `Daemon start success!` appears in the log, it means the service started successfully. - -#### 2. Customized Startup - -The MindIE installation paths are all the default paths `/usr/local/Ascend/.` If you customize the installation path, synchronize the path in the following example. - -Open config.json in the mindie-service directory and modify the server-related configuration. 
- -```bash -vim /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json -``` - -where `modelWeightPath` and `backendType` must be modified to configure: - -```bash -"modelWeightPath": "/path/to/mf_model/qwen1_5_72b" -"backendType": "ms" -``` - -`modelWeightPath` is the model folder created in the previous step, where model and tokenizer and other related files are placed; `backendType` backend startup method is `ms`. - -Other relevant parameters are as follows: - -| Optional Configurations | Value Type | Range of Values | Configuration Descriptions | -| ------------------- | -------- | -------------------- |----------------------------------------------------------------------------------------------------------------------------| -| httpsEnabled | Bool | True/False | Whether to enable HTTPS communication security authentication, the default is True. Easy to start, it is recommended to set to False. | -| maxSeqLen | int32 | Customized by user requirements, >0 | MaxSeqLen. Length of input + length of output <= maxSeqLen, user selects maxSeqLen according to inference scenario | -| npuDeviceIds | list | Customization by model requirements | This configuration item is temporarily disabled. The actual running card is controlled by the visible card environment variable and the worldSize configuration. Resource reference needs to be adjusted by visible card according to [CANN Environment Variables](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/envref/envref_07_0029.html). | -| worldSize | int32 | Customization by model requirements | The number of cards used for the visible card. Example: ASCEND_RT_VISIBLE_DEVICES=4,0,1,2 and worldSize=2, then take the 4th, 0th card to run. | -| npuMemSize | int32 | Customization by Video Memory | The upper limit of the size (GB) that can be used to request KVCache in the NPU can be calculated according to the actual size of the deployment model: npuMemSize=(total free - weight/mp number)*factor, where the factor is taken as 0.8. Recommended value: 8. | -| cpuMemSize | int32 | Customization by Memory | The upper limit of the size (GB) that can be used to request KVCache in CPU is related to the swap function, and the Cache will be released for recalculation when cpuMemSize is insufficient. Recommended value: 5. | -| maxPrefillBatchSize | int32 | [1, maxBatchSize] | Maximum prefill batch size. maxPrefillBatchSize and maxPrefillTokens will complete the batch if they reach their respective values first. This parameter is mainly used in scenarios where there is a clear need to limit the batch size of the prefill phase, otherwise it can be set to 0 (at this point, the engine will take the maxBatchSize value by default) or the same as maxBatchSize. Required, default value: 50. | -| maxPrefillTokens | int32 | [5120, 409600] | At each prefill, the total number of all input tokens in the current batch must not exceed maxPrefillTokens. maxPrefillTokens and maxPrefillBatchSize will complete the current group batch if they reach their respective values first. Required, default value: 8192. | -| maxBatchSize | int32 | [1, 5000] | Maximum decode batch size, estimated based on model size and NPU graphics memory. | -| maxIterTimes | int32 | [1, maxSeqLen-1] | The number of decodes that can be performed, i.e. the maximum length of a sentence that can be generated. 
There is a max_output_length parameter inside the request level, maxIterTimes is a global setting, and max_output_length is taken as the maximum length of the final output. | - -The full set of configuration parameters is available in [MindIE Service Developer's Guide - Quick Start - Configuration Parameter Descriptions](https://www.hiascend.com/document/detail/zh/mindie/10RC3/mindieservice/servicedev/mindie_service0285.html). - -Run the startup script: - -```bash -cd /path/to/mindie/latest/mindie-service -nohup ./bin/mindieservice_daemon > output.log 2>&1 & -tail -f output.log -``` - -When `Daemon start success!` appears in the log, it means the service started successfully. - -The related logs of Python: - -```bash -export MINDIE_LLM_PYTHON_LOG_TO_FILE=1 -export MINDIE_LLM_PYTHON_LOG_PATH=/usr/local/Ascend/mindie/latest/mindie-service/logs/pythonlog.log -tail -f /usr/local/Ascend/mindie/latest/mindie-service/logs/pythonlog.log -``` - -## MindIE Service Deployment and Inference Example - -The following example installs each component to the default path `/usr/local/Ascend/.` and the model uses `Qwen1.5-72B`. - -### Preparing Model Files - -Take Qwen1.5-72B as an example to prepare the model file directory. For details of the directory structure and configuration, refer to [Preparing Model Files](#preparing-model-files): - -```bash -mkdir -p mf_model/qwen1_5_72b -``` - -### Starting MindIE - -#### 1. One-click Start (Recommended) - -Go to the `scripts` directory and execute the mindie startup script: - -```shell -cd ./scripts -bash run_mindie.sh --model-name qwen1_5_72b --model-path /path/to/mf_model/qwen1_5_72b -``` - -View log: - -```bash -tail -f output.log -``` - -When `Daemon start success!` appears in the log, it means the service started successfully. - -#### 2. Customized Startup - -Open config.json in the mindie-service directory and modify the server-related configuration. 
- -```bash -vim /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json -``` - -The final modified config.json is as follows: - -```json -{ - "Version" : "1.0.0", - "LogConfig" : - { - "logLevel" : "Info", - "logFileSize" : 20, - "logFileNum" : 20, - "logPath" : "logs/mindservice.log" - }, - - "ServerConfig" : - { - "ipAddress" : "127.0.0.1", - "managementIpAddress" : "127.0.0.2", - "port" : 1025, - "managementPort" : 1026, - "metricsPort" : 1027, - "allowAllZeroIpListening" : false, - "maxLinkNum" : 1000, - "httpsEnabled" : false, - "fullTextEnabled" : false, - "tlsCaPath" : "security/ca/", - "tlsCaFile" : ["ca.pem"], - "tlsCert" : "security/certs/server.pem", - "tlsPk" : "security/keys/server.key.pem", - "tlsPkPwd" : "security/pass/key_pwd.txt", - "tlsCrl" : "security/certs/server_crl.pem", - "managementTlsCaFile" : ["management_ca.pem"], - "managementTlsCert" : "security/certs/management/server.pem", - "managementTlsPk" : "security/keys/management/server.key.pem", - "managementTlsPkPwd" : "security/pass/management/key_pwd.txt", - "managementTlsCrl" : "security/certs/management/server_crl.pem", - "kmcKsfMaster" : "tools/pmt/master/ksfa", - "kmcKsfStandby" : "tools/pmt/standby/ksfb", - "inferMode" : "standard", - "interCommTLSEnabled" : false, - "interCommPort" : 1121, - "interCommTlsCaFile" : "security/grpc/ca/ca.pem", - "interCommTlsCert" : "security/grpc/certs/server.pem", - "interCommPk" : "security/grpc/keys/server.key.pem", - "interCommPkPwd" : "security/grpc/pass/key_pwd.txt", - "interCommTlsCrl" : "security/certs/server_crl.pem", - "openAiSupport" : "vllm" - }, - - "BackendConfig" : { - "backendName" : "mindieservice_llm_engine", - "modelInstanceNumber" : 1, - "npuDeviceIds" : [[0,1,2,3]], - "tokenizerProcessNumber" : 8, - "multiNodesInferEnabled" : false, - "multiNodesInferPort" : 1120, - "interNodeTLSEnabled" : true, - "interNodeTlsCaFile" : "security/grpc/ca/ca.pem", - "interNodeTlsCert" : "security/grpc/certs/server.pem", - "interNodeTlsPk" : "security/grpc/keys/server.key.pem", - "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt", - "interNodeTlsCrl" : "security/grpc/certs/server_crl.pem", - "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa", - "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb", - "ModelDeployConfig" : - { - "maxSeqLen" : 8192, - "maxInputTokenLen" : 8192, - "truncation" : false, - "ModelConfig" : [ - { - "modelInstanceType" : "Standard", - "modelName" : "Qwen1.5-72B-Chat", - "modelWeightPath" : "/mf_model/qwen1_5_72b", - "worldSize" : 4, - "cpuMemSize" : 15, - "npuMemSize" : 15, - "backendType" : "ms" - } - ] - }, - - "ScheduleConfig" : - { - "templateType" : "Standard", - "templateName" : "Standard_LLM", - "cacheBlockSize" : 128, - - "maxPrefillBatchSize" : 50, - "maxPrefillTokens" : 8192, - "prefillTimeMsPerReq" : 150, - "prefillPolicyType" : 0, - - "decodeTimeMsPerReq" : 50, - "decodePolicyType" : 0, - - "maxBatchSize" : 200, - "maxIterTimes" : 4096, - "maxPreemptCount" : 0, - "supportSelectBatch" : false, - "maxQueueDelayMicroseconds" : 5000 - } - } -} -``` - -> For testing purposes, the `httpsEnabled` parameter is set to `false`, ignoring subsequent https communication related parameters. - -Go to the mindie-service directory to start the service: - -```bash -cd /usr/local/Ascend/mindie/1.0.RC3/mindie-service -nohup ./bin/mindieservice_daemon > output.log 2>&1 & -tail -f output.log -``` - -The following message is printed, indicating that the startup was successful. - -```bash -Daemon start success! 
-``` - -### Request Test - -After the service has started successfully, you can use the curl command to send a request for verification, as shown in the following example: - -```bash -curl -w "\ntime_total=%{time_total}\n" -H "Accept: application/json" -H "Content-type: application/json" -X POST -d '{"inputs": "I love Beijing, because","stream": false}' http://127.0.0.1:1025/generate -``` - -The validation is successful with the following returned inference result: - -```json -{"generated_text":" it is a city with a long history and rich culture....."} -``` - -## Model List - -Examples of MindIE inference for other models can be found in the introduction documentation for each model in [Model Library](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/start/models.html). \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/usage/multi_modal.md b/docs/mindformers/docs/source_en/usage/multi_modal.md deleted file mode 100644 index bd5550c94d58d9ea19f4be8b19ceaa843e1d24ba..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/usage/multi_modal.md +++ /dev/null @@ -1,329 +0,0 @@ -# Multimodal Model Development - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/usage/multi_modal.md) - -Multimodal models refer to artificial intelligence models capable of processing and combining information from different modalities (such as text, images, audio, video, etc.) for learning and inference. Traditional single-modality models typically focus on a single type of data, such as text classification models handling only text data or image recognition models handling only image data. In contrast, multimodal models integrate data from different sources to accomplish more complex tasks, enabling them to understand and generate richer and more comprehensive content. - -This document aims to introduce the multimodal models in MindSpore Transformers, providing detailed steps and examples to guide users in building custom multimodal models and data processing modules using MindSpore Transformers. Additionally, users can follow the document to complete tasks such as model training and inference. - -The unified architecture of multimodal models in **MindSpore Transformers** primarily includes the following components: - -- [Dataset Construction](#dataset-construction) -- [Data Processing Modules](#data-processing-modules) -- [Model Construction](#model-construction) - - [Model Configuration Class](#model-configuration-class) - - [Non-text Modality Processing Module](#non-text-modality-processing-module) - - [Cross-Modal Interaction Module](#cross-modal-interaction-module) - - [Text Generation Module](#text-generation-module) -- [Multimodal Model Practice](#multimodal-model-practice) - -## Dataset Construction - -Before training a multimodal model, it is often necessary to first construct a multimodal dataset. MindSpore Transformers currently provides `dataset` and `dataloader` classes for multimodal data, which users can directly utilize: - -- [BaseMultiModalDataLoader](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/dataset/dataloader/multi_modal_dataloader.py) is the multimodal dataset loading class. It handles the functionality of reading data from a `json` file. 
- [ModalToTextSFTDataset](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/dataset/modal_to_text_sft_dataset.py) is the multimodal dataset processing class. It handles multimodal data processing, as well as operations like batch processing and data repetition. For more details on multimodal data processing, refer to the [Data Processing Modules](#data-processing-modules).

Below is an example of part of the training dataset `json` file for the `CogVLM2-Video` model:

```json
[{
    "id": "v_p1QGn0IzfW0.mp4",
    "conversations": [
      {
        "from": "user",
        "value": "<|reserved_special_token_3|>/path/VideoChatGPT/convert/v_p1QGn0IzfW0.mp4<|reserved_special_token_4|>What equipment is visible in the gym where the boy is doing his routine?"
      },
      {
        "from": "assistant",
        "value": "There is other equipment visible in the gym like a high bar and still rings."
      }
    ]
}]
```

In the dataset, `<|reserved_special_token_3|>` and `<|reserved_special_token_4|>` are placeholders used to wrap the video path in the `CogVLM2-Video` model.

Users can construct custom `json` files as needed. The file format should be a list containing multiple dictionaries, where each dictionary represents a data sample. In each sample, the `id` field denotes the data identifier, and the `conversations` field represents the multi-turn conversation content.

After constructing the `json` file, you can run the following example code to view the data samples from the dataset:

```python
from mindformers.dataset.dataloader.multi_modal_dataloader import BaseMultiModalDataLoader

# build data loader
dataset_loader = BaseMultiModalDataLoader(
    annotation_file='/path/dataset.json', shuffle=False
)
print(dataset_loader[0])

# ([['user', '<|reserved_special_token_3|>/path/VideoChatGPT/convert/v_p1QGn0IzfW0.mp4<|reserved_special_token_4|>What equipment is visible in the gym where the boy is doing his routine?'], ['assistant', 'There is other equipment visible in the gym like a high bar and still rings.']],)
```

## Data Processing Modules

During the training and inference of multimodal models, data processing modules are required to preprocess multimodal data. This module is invoked during training in `ModalToTextSFTDataset`, and during inference in the [MultiModalToTextPipeline](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/pipeline/mindformers.pipeline.MultiModalToTextPipeline.html#mindformers.pipeline.MultiModalToTextPipeline).

Below is a flowchart of the multimodal data processing. The custom modules in the diagram need to be implemented by the user according to their specific requirements, while the other modules can be invoked directly.

![multi_modal.png](../../source_zh_cn/usage/image/multi_modal.png)

The following uses the [CogVLM2-Video model data preprocessing module](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/cogvlm2/cogvlm2_processor.py) as an example to introduce the functionality of each component of the multimodal data processing module.

1. `BaseXModalToTextProcessor` is mainly used to receive raw multimodal data for inference and perform preprocessing operations. It also implements post-processing operations for inference results, and users can use this class directly.
2. `BaseXModalToTextTransform` is mainly used to process the data returned by `BaseXModalToTextProcessor` or the multimodal dataset into data suitable for inference or training. This class can also be used directly by users.
3. 
[ModalContentTransformTemplate](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.multi_modal.ModalContentTransformTemplate.html#mindformers.models.multi_modal.ModalContentTransformTemplate) is the abstract class for all modality-specific data construction modules. Since data operations are model-dependent, users need to implement corresponding custom data construction classes based on their needs. In the `CogVLM2-Video` model, the `CogVLM2ContentTransformTemplate` class is implemented to handle both video and text data. -4. ModalContentBuilder is the abstract class for single-modality data processing. If the model needs to handle data from multiple modalities, corresponding single-modality data processing classes need to be created during the initialization of the custom data construction class. In the `CogVLM2-Video` model, the `CogVLM2VideoContentBuilder` class is implemented to handle video data, while the general text data processing class `BaseTextContentBuilder` is used to process text data. - -Below is an example of the data preprocessing code for training and inference in the `CogVLM2-Video` model. - -### Model Training Data Processing - -In multimodal model training tasks, data preprocessing configurations are typically written in the `train_dataset` section. The following is an example of the dataset-related configuration in the `CogVLM2-Video` model training configuration file: - -[finetune_cogvlm2_video_llama3_chat_13b_lora.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/finetune_cogvlm2_video_llama3_chat_13b_lora.yaml) - -```yaml -train_dataset: &train_dataset - data_loader: - type: BaseMultiModalDataLoader - annotation_file: "/path/train_data.json" - shuffle: True - modal_to_text_transform: - type: BaseXModalToTextTransform - max_length: 2048 - model_transform_template: - type: CogVLM2ContentTransformTemplate - output_columns: [ "input_ids", "images", "video_context_pos", "position_ids", "labels" ] - signal_type: "chat" - mode: 'train' - pos_pad_length: 2048 - tokenizer: - add_bos_token: False - add_eos_token: False - max_length: 2048 - pad_token: "<|reserved_special_token_0|>" - vocab_file: "/path/tokenizer.model" - type: CogVLM2Tokenizer -``` - -The `annotation_file` is the path to the training data's `json` file. Both `modal_to_text_transform` and `tokenizer` should be similar to those in the `processor` section of the inference configuration. 
The following example code builds the training dataset from this configuration and iterates over it:

```python
from mindformers.tools.register.config import MindFormerConfig
from mindformers.dataset.modal_to_text_sft_dataset import ModalToTextSFTDataset

# load configs
configs = MindFormerConfig("configs/cogvlm2/finetune_cogvlm2_video_llama3_chat_13b_lora.yaml")
# build dataset
multi_modal_dataset = ModalToTextSFTDataset(**configs.train_dataset)
# iterate dataset
for item in multi_modal_dataset:
    print(len(item))
    break
# 5, output 5 columns
```

### Model Inference Data Processing

The data processing module configuration in the `CogVLM2-Video` model inference configuration file is as follows:

[predict_cogvlm2_video_llama3_chat_13b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/predict_cogvlm2_video_llama3_chat_13b.yaml)

```yaml
processor:
  type: BaseXModalToTextProcessor
  model_transform_template:
    type: CogVLM2ContentTransformTemplate
    output_columns: [ "input_ids", "position_ids", "images", "video_context_pos" ]
    vstack_columns: [ "images", "video_context_pos" ]
    signal_type: "chat"
    pos_pad_length: 2048
  tokenizer:
    add_bos_token: False
    add_eos_token: False
    max_length: 2048
    pad_token: "<|reserved_special_token_0|>"
    vocab_file: "/path/tokenizer.model"
    type: CogVLM2Tokenizer
```

The `vocab_file` is the path to the vocabulary file used, while the other parameters are related to the model configuration and can be customized by the user as needed.

Below is example code for processing multimodal inference data. Unlike the training data processing, it yields a dictionary containing processed data such as `input_ids`, rather than a list.

```python
from mindformers.tools.register.config import MindFormerConfig
from mindformers.models.multi_modal.base_multi_modal_processor import BaseXModalToTextProcessor
from mindformers.models.cogvlm2.cogvlm2_tokenizer import CogVLM2Tokenizer

# build processor
configs = MindFormerConfig("configs/cogvlm2/predict_cogvlm2_video_llama3_chat_13b.yaml")
configs.processor.tokenizer = tokenizer = CogVLM2Tokenizer(**configs.processor.tokenizer)
processor = BaseXModalToTextProcessor(**configs.processor)

# process data
multi_modal_data = [
    {'video': "/path/video.mp4"},
    {'text': "Please describe this video."}
]

print(processor(multi_modal_data).keys())
# dict_keys(['input_ids', 'position_ids', 'images', 'video_context_pos'])
```

Once the multimodal dataset construction and data processing modules are implemented, data that the multimodal model can handle is available. The following sections describe how to construct a multimodal large model.

## Model Construction

A multimodal large model typically consists of three parts: a non-text modality processing module, a cross-modal interaction module, and a text generation module. The non-text modality processing module is usually a vision model pre-trained on large-scale data, the text generation module is typically a large text generation model, and the cross-modal interaction module usually consists of multiple linear layers.

### Model Configuration Class

In MindSpore Transformers, the parameters related to multimodal models are mainly controlled through the model configuration class. Below, we use the `CogVLM2Config` class as an example to explain how to build the model configuration class.
For the specific implementation, refer to [CogVLM2Config](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/cogvlm2/cogvlm2_config.py).
- -```python -@MindFormerRegister.register(MindFormerModuleType.CONFIG) -class CogVLM2Config(PretrainedConfig): - def __init__(self, - vision_model: PretrainedConfig, - llm_model: PretrainedConfig, - **kwargs): - super().__init__(**kwargs) - self.vision_model = vision_model - self.llm_model = llm_model -``` - -Parameter Explanation: - -1. `@MindFormerRegister.register(MindFormerModuleType.CONFIG)` is mainly used to register a custom model configuration class. Once registered, the model configuration class can be called by its name in the `yaml` file. -2. `vision_model` and `llm_model` represent the configuration classes for the vision model and text generation model, respectively. They are passed as parameters to the multimodal model configuration class and processed during the class initialization. -3. `PretrainedConfig` is the base class for all model configurations. For more details, refer to [PretrainedConfig](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.PretrainedConfig.html#mindformers.models.PretrainedConfig). - -In the configuration file, the model should be configured as follows. -For the specific implementation, refer to [predict_cogvlm2_video_llama3_chat_13b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/predict_cogvlm2_video_llama3_chat_13b.yaml). - -```yaml -model: - model_config: - type: MultiModalConfig - vision_model: - arch: - type: EVAModel - model_config: - type: EVA02Config - image_size: 224 - patch_size: 14 - hidden_size: 1792 - num_hidden_layers: 63 - ... - llm_model: - arch: - type: CogVLM2VideoLM - model_config: - type: LlamaConfig - seq_length: 2048 - hidden_size: 4096 - num_layers: 32 - ... - arch: - type: CogVLM2ForCausalLM -``` - -In this configuration file, `EVAModel` and `EVA02Config` are used as the `vision_model` and its configuration class, while `CogVLM2VideoLM` and `LlamaConfig` are used as the `llm_model` and its configuration class. -Together, they form the multimodal model `CogVLM2ForCausalLM`. These classes are all pre-implemented modules in MindSpore Transformers. Below, we will explain how to implement custom modules. - -### Non-Text Modality Processing Module - -MindSpore Transformers provides models like `ViT` and `EVA02` as visual information processing modules. Below, we use the `EVA02` model as an example to explain how to construct a non-text modality processing module. -For more details, refer to [EVAModel](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/eva02/eva.py) and [EVA02Config](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/eva02/eva_config.py). - -```python -from mindformers.tools.register import MindFormerRegister, MindFormerModuleType -from mindformers.models.modeling_utils import PreTrainedModel -from mindformers.models.eva02.eva_config import EVA02Config - -class EVA02PreTrainedModel(PreTrainedModel): - config_class = EVA02Config - base_model_prefix = "eva02" - -@MindFormerRegister.register(MindFormerModuleType.MODELS) -class EVAModel(EVA02PreTrainedModel): - def __init__(self, config=None): - config = config if config else EVA02Config() - super().__init__(config) -``` - -Parameter Explanation: - -1. `@MindFormerRegister.register(MindFormerModuleType.MODELS)` is mainly used to register a custom model class. Once registered, the model class can be called by its name in the `yaml` file. -2. 
`EVA02PreTrainedModel` inherits from the `PreTrainedModel` class and is mainly used to specify the model configuration class and the prefix for model parameter names. `EVAModel` is the specific implementation of the model, inheriting from the `EVA02PreTrainedModel` class. For more details, refer to the [PreTrainedModel](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/models/mindformers.models.PreTrainedModel.html#mindformers.models.PreTrainedModel) API. -3. `EVAModel` mainly processes visual information in the data and feeds the processed visual features into the **cross-modal interaction module**. - -### Cross-Modal Interaction Module - -The text generation module is usually a pre-trained large language model, while the non-text modality processing module is a model pre-trained on large-scale non-text data. The output features from these models differ significantly from those in the text features and cannot be directly input into the text generation module for inference. Therefore, a cross-modal interaction module, matching the text generation module, is needed to process visual features into vectors that can be handled by the text generation module. - -Below, we use the `VisionMLPAdapter` in the `CogVLM2-Video` model as an example to explain the structure and function of the cross-modal interaction module. - -```python -class VisionMLPAdapter(nn.Cell): - def __init__(self, vision_grid_size, vision_hidden_size, text_hidden_size, text_intermediate_size, - compute_dtype=ms.float16, param_init_type=ms.float16): - super().__init__() - self.grid_size = vision_grid_size - self.linear_proj = GLU(in_features=vision_hidden_size, - hidden_size=text_hidden_size, - intermediate_size=text_intermediate_size, - compute_dtype=compute_dtype, param_init_type=param_init_type) - self.conv = nn.Conv2d(in_channels=vision_hidden_size, out_channels=vision_hidden_size, - kernel_size=2, stride=2, dtype=param_init_type, has_bias=True).to_float(compute_dtype) -``` - -In the `VisionMLPAdapter`, the output of the `EVAModel` is processed through operations such as Linear and Conv2D to match the same dimensionality as the text features. Here, `vision_hidden_size` and `text_hidden_size` represent the dimensionalities of the visual and text features, respectively. - -### Text Generation Module - -MindSpore Transformers provides large language models such as `Llama2` and `Llama3` as text generation modules, which, together with the non-text modality processing module and cross-modal interaction module, form the multimodal model. - -```python -@MindFormerRegister.register(MindFormerModuleType.MODELS) -class MultiModalForCausalLM(BaseXModalToTextModel): - def __init__(self, config: MultiModalConfig, **kwargs): - super().__init__(config, **kwargs) - self.config = config - self.vision_model = build_network(config.vision_model) - self.llm_model = build_network(config.llm_model) - self.mlp_adapter = VisionMLPAdapter(**kwargs) - - def prepare_inputs_for_generation(self, input_ids, **kwargs): - """Prepare inputs for generation in inference.""" - - def prepare_inputs_for_predict_layout(self, input_ids, **kwargs): - """Prepare inputs for generation in inference.""" - - def set_dynamic_inputs(self, **kwargs): - """Set dynamic inputs for model.""" - - def construct(self, input_ids, **kwargs): - """Model forward.""" -``` - -Parameter Explanation: - -1. `MultiModalForCausalLM`, as the multimodal model class, inherits from the base class `BaseXModalToTextModel`. 
During the construction of this class, the `build_network` function and the corresponding module configurations are used to initialize the non-text modality processing module `vision_model`, the text generation module `llm_model`, and the cross-modal interaction module `VisionMLPAdapter`. -2. The `prepare_inputs_for_generation` method preprocesses the input data, ensuring that the processed data can be used for model inference through the `construct` method. -3. The `prepare_inputs_for_predict_layout` method constructs data that the model can handle. Its return value corresponds to the input parameters of the `construct` method, and the constructed data allows for model compilation. -4. The `set_dynamic_inputs` method configures dynamic shapes for some input data in the model. -5. The `construct` method is the common interface for all models and serves as the forward execution function for the multimodal model. - -## Multimodal Model Practice - -After implementing the multimodal dataset, data processing modules, and multimodal model construction, you can start model pre-training, fine-tuning, inference, and other tasks by using the model configuration file. This requires creating the corresponding model configuration file. - -For specific model configuration files, refer to [predict_cogvlm2_video_llama3_chat_13b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/predict_cogvlm2_video_llama3_chat_13b.yaml) and [finetune_cogvlm2_video_llama3_chat_13b_lora.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/finetune_cogvlm2_video_llama3_chat_13b_lora.yaml), which correspond to model inference and fine-tuning, respectively. For the meaning of specific parameters, refer to the [configuration file documentation](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/appendix/conf_files.html). - -In the user-defined configuration file, sections such as `model`, `processor`, and `train_dataset` need to correspond to the user's custom **dataset**, **data processing module**, and **multimodal model**. - -After editing the custom configuration file, refer to the [CogVLM2-Video model documentation](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md) to start model [inference](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md#推理) and [fine-tuning](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md#微调) tasks. diff --git a/docs/mindformers/docs/source_en/usage/pre_training.md b/docs/mindformers/docs/source_en/usage/pre_training.md deleted file mode 100644 index ca684d4f8cd27c28bd5ae87602529f207d2a619e..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/usage/pre_training.md +++ /dev/null @@ -1,89 +0,0 @@ -# Pretraining - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/usage/pre_training.md) - -## Overview - -Pretraining refers to training a model on large-scale unlabeled data, so that the model can comprehensively capture a wide range of features of a language. A pretrained model can learn knowledge at the vocabulary, syntax, and semantic levels. After fine-tuning, the knowledge is applied in downstream tasks to optimize the performance of specific tasks. 
The objective of the MindSpore Transformers framework pretraining is to help developers quickly and conveniently build and train pretrained models based on the Transformer architecture. - -## Procedure - -Based on actual operations, the basic pretraining process can be divided into the following steps: - -1. **Preparing a dataset:** - Prepare a large-scale unlabeled text dataset for pretraining. Such datasets contain a large amount of text from multiple sources, such as networks, books, and articles. The diversity and scale of datasets have a great impact on the generalization capability of models. - -2. **Selecting a model architecture:** - Select a proper model architecture to build a pretrained model based on task requirements and computing resources. - -3. **Pretraining:** - Perform pretraining with the prepared large-scale dataset and use the configured model architecture and training configuration to perform long-time training to generate the final pretrained model weight. - -4. **Saving a model:** - After the training is complete, save the model weight to the specified location. - -## MindSpore Transformers-based Pretraining Practice - -Currently, MindSpore Transformers supports mainstream foundation models in the industry. In this practice, Llama2-7B and Llama3-70B are used to demonstrate [Single-Node Training](#single-node-training) and [Multi-Node Training](#multi-node-training), respectively. - -### Preparing a Dataset - -| Dataset | Applicable Model | Applicable Phase | Download Link | -|:--------|:----------:|:--------:|:-------------------------------------------------------------------------------:| -| Wikitext2 | Llama2-7B | Pretrain | [Link](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/dataset/wikitext-2/wikitext-2-v1.zip) | -| Wiki103 | Llama3-70B | Pretrain | [Link](https://dagshub.com/DagsHub/WIkiText-103/src/main/dataset/tokens) | - -### Data Preprocessing - -For details about how to process the Llama2-7B and Llama3-70B datasets, see [the Wikitext2 data preprocessing](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md) and [the Wiki103 data preprocessing](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/README.md), respectively. - -## Executing a Pretrained Task - -### Single-Node Training - -Take Llama2-7B as an example. Specify the configuration file [pretrain_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/pretrain_llama2_7b.yaml) and start the [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/run_mindformer.py) script in msrun mode to perform 8-device distributed training. The startup command is as follows: - -```bash -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_7b.yaml \ - --train_dataset_dir /{path}/wiki4096.mindrecord \ - --use_parallel True \ - --run_mode train" 8 - - # Parameters: - config: model configuration file, which is stored in the config directory of the MindSpore Transformers code repository. - train_dataset_dir: path of the training dataset. - use_parallel: specifies whether to enable parallelism. - run_mode: running mode. The value can be train, finetune, or predict (inference). - ``` - -After the task is executed, the **checkpoint** folder is generated in the **mindformers/output** directory, and the model file is saved in this folder. - -### Multi-Node Training - -Take Llama3-70B as an example. 
Use the [pretrain_llama3_70b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/llama3_70b/pretrain_llama3_70b.yaml) configuration file to run [run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/run_mindformer.py) in msrun mode and perform 8-node 64-device pretraining. For multi-node multi-device distributed training, run the launch script on every node and set **MASTER_ADDR** to the IP address of the primary node. The same **MASTER_ADDR** value is used on all nodes; only the **NODE_RANK** value differs from node to node. For details about the parameter positions, see [msrun Launching Guide](https://www.mindspore.cn/tutorials/en/r2.6.0/parallel/msrun_launcher.html).

```shell
# Node 0: Set the IP address of node 0 to the value of MASTER_ADDR, which is used as the IP address of the primary node. There are 64 devices in total with 8 devices for each node.
# Change the value of node_num for nodes 0 to 7 in sequence. For example, if there are eight nodes, the value of node_num ranges from 0 to 7.
bash scripts/msrun_launcher.sh "run_mindformer.py \
 --register_path research/llama3 \
 --config research/llama3/llama3_70b/pretrain_llama3_70b.yaml \
 --train_dataset_dir {dataset_dir} \
 --use_parallel True \
 --run_mode train" \
 64 8 {MASTER_ADDR} 8118 {node_num} output/msrun_log False 300

 # Parameters:
 register_path: registered path of the model API, a directory that contains the Python scripts of the model (can be the path of the model folder in the 'research' directory).
 config: model configuration file, which is stored in the config directory of the MindSpore Transformers code repository.
 train_dataset_dir: path of the training dataset.
 use_parallel: specifies whether to enable parallelism.
 run_mode: running mode. The value can be train, finetune, or predict (inference).
```

**Note**: During multi-node distributed training, performance problems may occur. To ensure the efficiency and stability of the training process, you are advised to tune the performance by referring to [Large Model Performance Optimization Guide](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/perf_optimize/perf_optimize.html).

## More Information

For more training examples of different models, see [the models supported by MindFormers](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/start/models.html).
diff --git a/docs/mindformers/docs/source_en/usage/pretrain_gpt.md b/docs/mindformers/docs/source_en/usage/pretrain_gpt.md
deleted file mode 100644
index 1b4c27ab0b9167f751c31b09b22370418ddb316d..0000000000000000000000000000000000000000
--- a/docs/mindformers/docs/source_en/usage/pretrain_gpt.md
+++ /dev/null
@@ -1,505 +0,0 @@
# Dynamic Graph Parallelism

[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/usage/pretrain_gpt.md)

## Overview

This tutorial demonstrates how to train GPT models with the MindSpore Transformers dynamic graph parallel framework. The framework supports tensor parallelism, pipeline parallelism, sequence parallelism, and other parallel scenarios, as well as features such as the distributed optimizer and dynamic learning rates, helping developers quickly and easily build and train GPT pre-training models on the dynamic graph parallel framework.
- -## Operating Practice - -The following GPT model training is based on Ascend platform. - -### Sample Code Reference - -The directory structure is as follows: - -```text -└─ gpt - ├─ pretrain_gpt.py - ├─ pretrain_gpt.sh - └─ pretrain_gpt_7B.yaml - ... -``` - -Among them, `pretrain_gpt.py` is the script for environment configuration, model object creation and training. `pretrain_gpt.sh` is the startup execution script. `pretrain_gpt_7B.yaml` is the configuration item. - -### Model Structure - -GPT uses the `Transformer` model as its main architecture, and the network structure is mainly built around the basic building blocks of the `Transformer`. - -In the model, five parameters are initialized, `config` is the model configuration item (in the `model_config` of the yaml file), `num_tokentypes` specifies the type of embedding, `parallel_output` is used to confirm whether to output the output of each parallel Tensor, `pre_ process` and `post_process` specify whether it is the first and last stage, respectively. - -The called `get_language_model` is an interface based on the `Transformer` model, see the api documentation for `get_language_model` for details. - -Note: The dataset return values are to correspond to the parameters required by the forward process defined by the model. - -```python -from mindformers.experimental.parallel_core.pynative.transformer.module import Module -from mindformers.experimental.parallel_core.pynative.transformer.language_model import get_language_model -from mindformers.experimental.parallel_core.pynative.transformer import ParallelLMLogits -from mindformers.experimental.parallel_core.pynative.training.loss_func import VocabParallelCrossEntropy - - -class AttnMaskType(enum.Enum): - padding = 1 - causal = 2 - no_mask = 3 - padding_causal = 4 - - -attn_mask_type_mapping = { - "padding": AttnMaskType.padding, - "causal": AttnMaskType.causal, -} - - -class GPTModel(Module): - def __init__(self, - config, - num_tokentypes=0, - parallel_output=True, - pre_process=True, - post_process=True): - super().__init__(config=config,\ - share_embeddings_and_output_weights=not config.untie_embeddings_and_output_weights) - - self.parallel_output = parallel_output - self.pre_process = pre_process - self.post_process = post_process - self.untie_embeddings_and_output_weights = config.untie_embeddings_and_output_weights - self.fp16_lm_cross_entropy = config.fp16_lm_cross_entropy - - self.set_model_key() - encoder_attn_mask_type = None - if config.encoder_attn_mask_type is not None: - encoder_attn_mask_type = attn_mask_type_mapping.get(config.encoder_attn_mask_type) - if encoder_attn_mask_type is None: - raise ValueError(f"encoder_attn_mask_type must be one of {attn_mask_type_mapping.keys()}, but got" - f"{config.encoder_attn_mask_type}") - - self.language_model, self._language_model_key = get_language_model( - config=config, - num_tokentypes=num_tokentypes, - add_pooler=False, - encoder_attn_mask_type=encoder_attn_mask_type, - pre_process=self.pre_process, - post_process=self.post_process) - - if self.post_process: - self.parallel_lm_logits = ParallelLMLogits(config=config, - bias=False, - compute_dtype=config.compute_dtype) - self.loss = VocabParallelCrossEntropy() - - if not config.untie_embeddings_and_output_weights: - self.initialize_word_embeddings() - - def set_input_tensor(self, input_tensor): - """ set input_tensor to model """ - self.language_model.set_input_tensor(input_tensor) - - def set_model_key(self): - """ set model key for differentiate PipelineCell process """ - 
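        # The key set here identifies this network as a GPT model so that it can be distinguished when wrapped by PipelineCell.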
self.model_key = "gpt3" - - def construct(self, input_ids, position_ids, attention_mask, loss_mask, - retriever_input_ids=None, - retriever_position_ids=None, - retriever_attn_mask=None, - labels=None, tokentype_ids=None, inference_params=None): - """ gpt model forward """ - # use RoPE - position_ids = None - retriever_input_ids = None - retriever_position_ids = None - retriever_attn_mask = None - lm_output = self.language_model( - input_ids, - position_ids, - attention_mask, - retriever_input_ids=retriever_input_ids, - retriever_position_ids=retriever_position_ids, - retriever_attn_mask=retriever_attn_mask, - inference_params=inference_params) - if self.post_process: - return post_language_model_processing( - self.parallel_lm_logits, self.loss, - lm_output, labels, - self.language_model.output_layer.weight if\ - self.untie_embeddings_and_output_weights else self.shared_embedding_or_output_weight(), - self.parallel_output, - self.fp16_lm_cross_entropy, - loss_mask) - else: - return lm_output -``` - -When `post_process` is set to `True`, the output `lm_output` of the language model needs to be post-processed to output losses and predictions. - -```python -import mindspore.common.dtype as mstype - -def post_language_model_processing(parallel_lm_logits, loss_fn, lm_output, labels, logit_weights, - parallel_output, fp16_lm_cross_entropy, loss_mask): - """ gpt model post process forward """ - output = parallel_lm_logits(lm_output, logit_weights, parallel_output) - - if labels is None: - return output - - labels = labels - loss_mask = loss_mask.reshape(-1) - - if fp16_lm_cross_entropy: - if output.dtype != mstype.float16: - raise ValueError(f"When fp16_lm_cross_entropy=True, output should be float16, but got {output.dtype}") - loss = loss_fn(output, labels, loss_mask) - else: - loss = loss_fn(output.astype(mstype.float32), labels) - token_nums = loss_mask.sum() - loss_mask = loss_mask.astype(mstype.float32) - loss = ops.sum(loss * loss_mask.float()) / loss_mask.sum() - return loss, output, token_nums -``` - -### Dynamic Graph Parallel Training Configuration - -Configuration items for dynamic graph parallel are read through a yaml file and are categorized into different types, including training configuration, parallel configuration, and model configuration. The next section briefly describes the basic configurations needed for large model training. - -#### training_config - -```yaml -training_config: - seed: 42 # Seeds for fixed randomness - output_dir: './output' # Output directory for storing checkpoints, logs, etc. 
- training_iters: 10 # The number of training iterations - log_interval: 1 # Frequency of log prints - save_interval: null # Frequency of storing checkpoints - loss_scale: 4096 # Initial value of loss scale - grad_clip_kwargs: - grad_clip_type: "ClipGlobalNorm" # Gradient cropping methods, optional: "ClipGlobalNorm" or "GradClipByValue" - clip_value: 1.0 - loss_reduction: "mean" # loss reduction methods, optional: "mean" or "sum" - loss_func_kwargs: - loss_func_type: "VocabParallelCrossEntropy" # Loss function, optional: "VocabParallelCrossEntropy" or "CrossEntropyLoss" - use_distributed_optimizer: True # Whether to use a distributed optimizer -``` - -#### parallel_config - -```yaml -parallel_config: - tensor_model_parallel_size: 1 # Tensor parallel - pipeline_model_parallel_size: 1 # Pipeline parallel - expert_model_parallel_size: 1 # Expert parallel - virtual_pipeline_model_parallel_size: null # Virtual pipeline parallel - sequence_parallel: False # Sequence parallel -``` - -#### gpt_config - -```yaml -model_config: - params_dtype: "float32" # Parameter initialization type - compute_dtype: "bfloat16" # Types used in calculations - position_embedding_type: 'rope' # Type of location code, optional: "rope" or "absolute" - untie_embeddings_and_output_weights: True # Whether the embedding layer and the head layer do not share weights - # Configure the GPT 7B model - num_layers: 6 # The number of Transformer layers - hidden_size: 4096 # Size of the hidden layer - ffn_hidden_size: 11008 # Size of feedforward neural network hidden layer - num_attention_heads: 32 # Number of attention heads -``` - -The GPT model is currently available in three different sizes of configurations: 7B, 13B and 70B. - -```yaml -7B: - num_layers: 32 - hidden_size: 4096 - ffn_hidden_size: 11008 - num_attention_heads: 32 -13B: - num_layers: 40 - hidden_size: 5120 - ffn_hidden_size: 13824 - num_attention_heads: 40 -70B: - num_layers: 80 - hidden_size: 8192 - ffn_hidden_size: 28672 - num_attention_heads: 64 - group_query_attention: True - num_query_groups: 8 -``` - -#### dataset_config - -```yaml -dataset_config: - batch_size: 1 # Size of data removed from the dataset in one iteration - micro_batch_num: 2 # Number of micro batches - dataset_dir: './dataset' # Catalog where the dataset is located - shuffle: False # Whether to break the order -``` - -#### optimizer_config - -```yaml -optimizer_config: - optimizer_type: "AdamW" # Optimizer types, optional: "AdamW", "Adam", "SGD", "Came", "mint.AdamW" and "SpeedAdamW" - betas: # Optimizer input parameters - - 0.9 - - 0.95 - eps: 1.e-8 - learning_rate: 1.25e-6 # Initial learning rate - weight_decay: 1.e-1 # Weight decay factor - learning_rate_scheduler_kwargs: # Learning rate adjustment strategy - warmup_steps: 200 - decay_steps: 2000 - use_cosine: True - end_learning_rate: 1.25e-7 -``` - -### Model Training Configuration Parsing - -The passing yaml configuration file is parsed in pretrain_gpt.py to get the training configuration, model configuration, optimizer configuration, parallel strategy configuration, and dataset configuration. 
- -```python -import argparse -from mindformers.experimental.parallel_core.pynative.config import ( - init_configs_from_yaml -) - -def get_arg_parser(): - """get argument parser""" - parser = argparse.ArgumentParser(description="Train gpt model") - parser.add_argument("--config_path", type=str, default="pretrain_gpt.yaml", help="The path to the config file.") - parser.add_argument("--run_cmd", type=str, default="", help="running cmd.") - parser.add_argument("--model_type", type=str, default="gpt_config", help="Input model config.") - return parser -parser = get_arg_parser() -args = parser.parse_args() - -all_config = init_configs_from_yaml(args.config_path) - -training_config = all_config.training_config -model_config = all_config.model_config -optimizer_config = all_config.optimizer_config -parallel_config = all_config.parallel_config -dataset_config = all_config.dataset_config -``` - -### Communication Configuration - -The set_context interface allows you to specify the run mode, run device, and run card number. The parallel script also needs to specify the parallel mode `parallel_mode` as the data parallel mode and initialize the HCCL, NCCL or MCCL communication through init depending on the different device requirements. Specify platform: set `device_target` to `Ascend`. You can use `set_context(pynative_synchronize=True)` in debugging phase to enable synchronization mode and locate the error report location more accurately. - -```python -import mindspore as ms - - -def set_parallel_context(parallel_config): - init() - initialize_model_parallel( - tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, - pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size=parallel_config.virtual_pipeline_model_parallel_size, - ) - logger.info( - f"dp {get_data_parallel_world_size()} | " - f"pp {parallel_config.pipeline_model_parallel_size} | " - f"tp {parallel_config.tensor_model_parallel_size} | " - f"sp {parallel_config.sequence_parallel} | " - f"vpp {parallel_config.virtual_pipeline_model_parallel_size}" - ) - - -def set_seed(seed): - # set global seed, np seed, and dataset seed - ms.set_seed(seed) - # set rng seed - ms.manual_seed(seed) - - -ms.set_context(mode=ms.PYNATIVE_MODE) -ms.set_device(device_target="Ascend") -set_parallel_context(parallel_config) -set_seed(training_config.seed) -``` - -### Creating Network Objects - -Get the GPT model from the model library and create a network model object based on the configuration file. Set different weight decay coefficients for different parameters via `set_weight_decay`, a function that divides the parameters into two groups, one with a specific weight decay value applied and the other with a weight decay of `0`, and returns a list containing information about the grouping of the parameters assigned to the `group_params` variable. The `get_optimizer` function is called, passing in `optimizer_config` (optimizer configuration), `training_config` (training configuration), `group_params` (information about the grouping of parameters obtained earlier), `network_with_loss` (an object containing the model and loss ), and a gradient reduction operation (obtained from `training_config.loss_reduction`) that returns an optimizer object and assigns it to the `optimizer` variable. -Create a `TrainOneStepCell` object, which is typically used to perform one-step optimization during training. 
Pass `network_with_loss`, `optimizer` and configuration as parameters and assign them to the train_one_step_cell variable. - -Complete code for creating network objects: - -```python -from mindformers.experimental.parallel_core.pynative.optimizer import get_optimizer -from mindformers.experimental.parallel_core.pynative.training import get_model -from mindformers.experimental.parallel_core.pynative.training import TrainOneStepCell -from mindformers.experimental.parallel_core.models import GPTModel - - -def decay_filter(x): - return "norm" not in x.name.lower() and "bias" not in x.name.lower() - - -def set_weight_decay(params, weight_decay=1e-1): - decay_params = list(filter(decay_filter, params)) - other_params = list(filter(lambda x: not decay_filter(x), params)) - group_params = [] - if decay_params: - group_params.append({"params": decay_params, "weight_decay": weight_decay}) - if other_params: - group_params.append({"params": other_params, "weight_decay": 0.0}) - return group_params - - -def model_provider_func(pre_process=True, post_process=True): - network_with_loss = GPTModel( - model_config, pre_process=pre_process, post_process=post_process - ) - return network_with_loss - -network_with_loss = get_model(model_provider_func, training_config) - -group_params = set_weight_decay(network_with_loss.trainable_params(), optimizer_config.weight_decay) -optimizer = get_optimizer( - optimizer_config, - training_config, - group_params, - network_with_loss, - grad_allreduce_op=training_config.loss_reduction -) - -train_one_step_cell = TrainOneStepCell(network_with_loss, optimizer, None, training_config, model_config) -``` - -### Loading the Dataset and Performing Training - -```python -from dataset import get_dataset -from mindformers.experimental.parallel_core.pynative.training import train - -train_dataset_iterator, val_dataset_iterator = get_dataset(dataset_config) -train( - train_one_step_cell, - train_dataset_iterator, - training_config, - val_dataset_iterator, - metrics, - evaluation, -) -``` - -### Running the Training Script - -```bash -bash pretrain_gpt.sh xx.yaml -``` - -If xx.yaml is not specified, it defaults to pretrain_gpt_7B.yaml. - -The training script `pretrain_gpt.sh` is parsed in detail below: - -#### Setting Environment Variables - -`HCCL_BUFFSIZE=200` sets the size of the buffer for sharing data between the two NPUs to 200M; `HCCL_EXEC_TIMEOUT=600` sets the wait time for synchronization during execution between the devices to 10 minutes. `ASCEND_RT_VISIBLE_DEVICES` specifies the visible device number, here set to device `0` card. - -```bash -export HCCL_BUFFSIZE=200 -export HCCL_EXEC_TIMEOUT=600 -export ASCEND_RT_VISIBLE_DEVICES='0' -``` - -#### Setting Port Number - -```bash -port=8828 -``` - -If the previous configuration exits abnormally, you can use the following code to clean it up. - -```bash -PIDS=$(sudo lsof -i :$port | awk 'NR>1 {print $2}') -if [ -n "$PIDS" ]; then - for pid in $PIDS; do - kill -9 $pid - echo "Killed process $pid" - done -else - echo "No processes found listening on port $port." -fi -``` - -#### Setting Log Storage Path - -Get the path to the directory where the current script is located and store it in the `project_dir` variable, and set the log path variable `log_path=“msrun_log”`. Delete the directory named `msrun_log` (if it exists) and recreate it. 
- -```bash -project_dir=$(cd "$(dirname "$0")" || exit; pwd) -log_path="msrun_log" - -rm -rf "${log_path}" -mkdir "${log_path}" -``` - -#### Setting the Number of Available Devices - -```bash -# Calculate the number of devices -IFS=',' read -r -a devices <<< "$ASCEND_RT_VISIBLE_DEVICES" -work_num=${#devices[@]} -``` - -#### Getting the Configuration File - -Try to get the configuration file path from the command line arguments, if no command line arguments are provided, the default configuration file “pretrain_gpt_7B.yaml” is used. - -```bash -config_path=$1 -if [ -z "$config_path" ]; then - config_path="pretrain_gpt_7B.yaml" -fi -``` - -#### Executing Training Scripts in msrun Mode - -```bash -msrun --worker_num "$work_num" --local_worker_num="$work_num" --master_port=$port --log_dir="$log_path" --join=True --cluster_time_out=300 pretrain_gpt.py --config_path="${config_path}" -``` - -#### Running Results - -Next, the corresponding script is invoked by command. - -```bash -bash pretrain_gpt.sh -``` - -After execution, the log files are saved to the `output` directory, where some of the files have the following directory structure: - -```text -└─ output - └─ log - ├─ rank_0 - | ├─ info.log - | └─ error.log - ├─ rank_1 - | ├─ info.log - | └─ error.log - ... -``` - -The results on the Loss section are saved in `output/log/rank_*/info.log`, example below: - -```text -train: Epoch:0, Step:5, Loss: 10.341485, Finite_grads: True, Loss_scale: 4096.0, Learning_rate: (1.250000e-06,1.250000e-06,), Time: 1403.24 ms -train: Epoch:0, Step:6, Loss: 10.38118, Finite_grads: True, Loss_scale: 4096.0, Learning_rate: (1.250000e-06,1.250000e-06,), Time: 1378.19 ms -train: Epoch:0, Step:7, Loss: 10.165115, Finite_grads: True, Loss_scale: 4096.0, Learning_rate: (1.250000e-06,1.250000e-06,), Time: 1370.32 ms -train: Epoch:0, Step:8, Loss: 10.039211, Finite_grads: True, Loss_scale: 4096.0, Learning_rate: (1.250000e-06,1.250000e-06,), Time: 1386.89 ms -train: Epoch:0, Step:9, Loss: 10.040031, Finite_grads: True, Loss_scale: 4096.0, Learning_rate: (1.250000e-06,1.250000e-06,), Time: 1475.95 ms -... -``` diff --git a/docs/mindformers/docs/source_en/usage/quantization.md b/docs/mindformers/docs/source_en/usage/quantization.md deleted file mode 100644 index a3eade7b724d805a9060f4701aac84a0f4228ad1..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/usage/quantization.md +++ /dev/null @@ -1,19 +0,0 @@ -# Quantization - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/usage/quantization.md) - -## Overview - -Quantization is an important technology for compressing foundation models. It converts floating-point parameters in a model into low-precision integer parameters to compress the parameters. As the parameters and specifications of a model increase, quantization can effectively reduce the model storage space and loading time during model deployment, improving the model inference performance. - -MindSpore Transformers integrates the MindSpore Golden Stick tool component to provide a unified quantization inference process, facilitating out-of-the-box use. 
Please refer to [MindSpore Golden Stick Installation Tutorial](https://www.mindspore.cn/golden_stick/docs/en/r1.1.0/install.html) for installation and [MindSpore Golden Stick Application PTQ algorithm](https://www.mindspore.cn/golden_stick/docs/en/r1.1.0/ptq/ptq.html) to quantify the models in MindSpore Transformers. - -## Model Support - -Currently, only the following models are supported, and the supported models are continuously being added. - -| Supported Model | -|--------------------------------------------------------------------------------------------------------------------------------------| -| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek3/deepseek3_671b/predict_deepseek3_671b.yaml) | -| [DeepSeek-R1](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml) | -| [Llama2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/predict_llama2_13b_ptq.yaml) | \ No newline at end of file diff --git a/docs/mindformers/docs/source_en/usage/sft_tuning.md b/docs/mindformers/docs/source_en/usage/sft_tuning.md deleted file mode 100644 index 5c38bd093d6d291190619470ead6810f72fe363a..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_en/usage/sft_tuning.md +++ /dev/null @@ -1,256 +0,0 @@ -# Supervised Fine-Tuning (SFT) - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_en/usage/sft_tuning.md) - -## Overview - -SFT (Supervised Fine-Tuning) employs supervised learning ideas and refers to the process of adjusting some or all of the parameters based on a pre-trained model to make it more adaptable to a specific task or dataset. - -## Process - -SFT consists of the following steps: - -- **Pretraining:** - A neural network model is trained on a large-scale dataset. For example, an LLM is trained on a large amount of unlabeled text data. The objective of the pre-training phase is to enable the model to obtain common knowledge and understanding capabilities. -- **Fine-tuning:** - Based on the target task, the obtained pretrained model is fine-tuned by using the new training dataset. During fine-tuning, all or some parameters of the original model can be optimized through backpropagation to achieve a better effect of the model on the target task. -- **Evaluation:** - After fine-tuning, a new model is obtained. The fine-tuning model may be evaluated by using the evaluation dataset of the target task to obtain performance metrics of the fine-tuning model on the target task. - -Based on actual operations, SFT may be decomposed into the following steps: - -1. **Selecting a pretrained model:** - Select a pretrained language model, for example, GPT-2 or Llama2. The pretrained model is trained on a large text corpus to learn a general representation of a language. -2. **Downloading the model weights:** - For the selected pretrained model, download the pretrained weights from the HuggingFace model library. -3. **Converting model weights:** - Convert the downloaded HuggingFace weight based on the required framework, for example, convert it to the CKPT weights supported by the MindSpore framework. -4. **Preparing a dataset:** - Select a dataset for fine-tuning tasks based on the fine-tuning objective. For LLMs, the fine-tuning dataset is data that contains text and labels, for example, the alpaca dataset. 
When using a dataset, you need to preprocess the corresponding data. For example, when using the MindSpore framework, you need to convert the dataset to the MindRecord format.
5. **Performing a fine-tuning task:**
   Use the dataset of the fine-tuning task to train the pre-trained model and update the model parameters. If all parameters are fine-tuned, all parameters are updated. After the fine-tuning task is complete, a new model is obtained.

## SFT Fine-Tuning Methods

MindSpore Transformers currently supports two SFT fine-tuning methods: full-parameter fine-tuning and LoRA low-parameter fine-tuning. Full-parameter fine-tuning updates all parameters during training. It is suitable for fine-tuning on large-scale data and achieves the best adaptation to the target task, but requires more computational resources. LoRA low-parameter fine-tuning updates only some of the parameters during training, so it uses less memory and runs faster than full-parameter fine-tuning, but it is not as effective as full-parameter fine-tuning on some tasks.

### Introduction to the LoRA Principle

LoRA significantly reduces the number of trainable parameters by representing the update to a weight matrix of the original model as the product of two low-rank matrices. For example, suppose a weight matrix W has size m x n. With LoRA, its update is decomposed into two low-rank matrices A and B, where A has size m x r and B has size r x n (r is much smaller than m and n). During fine-tuning, only these two low-rank matrices are updated; the rest of the original model is unchanged. For instance, for a 4096 x 4096 weight matrix with r = 16, the roughly 16.8 million original parameters stay frozen and only about 131 thousand LoRA parameters (2 x 4096 x 16) are trained.

This approach not only drastically reduces the computational overhead of fine-tuning, but also preserves the original performance of the model. It is especially suitable for model optimization in environments with limited data and restricted computational resources. For detailed principles, see the paper [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685).

## Using MindSpore Transformers for Full-Parameter Fine-Tuning

### Selecting a Pretrained Model

MindSpore Transformers supports mainstream foundation models in the industry. This practice uses the Llama2-7B model for SFT as an example.

### Downloading the Model Weights

MindSpore Transformers provides pretrained weights and vocabulary files that have already been converted for pretraining, fine-tuning, and inference. You can also download the official HuggingFace weights and convert them before use.

You can download the vocabulary at [tokenizer.model](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/tokenizer.model).

| Model | MindSpore Weight | HuggingFace Weight |
|:----------|:------------------------:|:----------------------:|
| Llama2-7B | [Link](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/llama2_7b.ckpt) | [Link](https://huggingface.co/meta-llama/Llama-2-7b-hf) |

> All weights of Llama2 need to be obtained by [submitting an application](https://ai.meta.com/resources/models-and-libraries/llama-downloads) to Meta. If necessary, apply for the weights by yourself.

### Converting Model Weights

Take the [Llama2-7B model](https://huggingface.co/meta-llama/Llama-2-7b-hf/tree/main) as an example. The original HuggingFace weight files include the following:
- `config.json`: main configuration information of the model architecture.
- `generation_config.json`: configuration information related to text generation.
- `safetensors file`: model weight file.
- `model.safetensors.index.json`: JSON file that describes the safetensors model parameter file index and model slices.
- `bin file`: PyTorch model weight file.
- `pytorch_model.bin.index.json`: JSON file that describes the PyTorch index and model slices.
- `tokenizer.json`: tokenizer vocabulary configuration file.
- `tokenizer.model`: tokenizer of the model.
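Before running the conversion, it can help to confirm that the download is complete. The following is a minimal, optional sketch that checks a local download directory for the files listed above and counts the weight shards recorded in the index files. It is not part of MindSpore Transformers, and the directory path is only an assumption for illustration.

```python
import json
from pathlib import Path

# Hypothetical local directory that holds the downloaded HuggingFace files.
weight_dir = Path("/path/Llama-2-7b-hf")

# Plain files that should be present according to the listing above.
for name in ["config.json", "generation_config.json", "tokenizer.json", "tokenizer.model"]:
    status = "found" if (weight_dir / name).exists() else "MISSING"
    print(f"{name}: {status}")

# The index files map every parameter name to the safetensors / bin shard that stores it.
for index_name in ["model.safetensors.index.json", "pytorch_model.bin.index.json"]:
    index_path = weight_dir / index_name
    if index_path.exists():
        weight_map = json.loads(index_path.read_text())["weight_map"]
        shards = sorted(set(weight_map.values()))
        print(f"{index_name}: {len(weight_map)} tensors in {len(shards)} shard file(s)")
```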
    - -MindSpore Transformers provides a weight conversion script. You can run the conversion script [convert_weight.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/convert_weight.py) to convert the HuggingFace weights to the complete CKPT weights. - -```bash -python convert_weight.py --model llama --input_path TORCH_CKPT_DIR --output_path {path}/MS_CKPT_NAME -``` - -Parameters: - -```commandline -model: model name. For details about other models, see the model description document. -input_path: path of the folder where the HuggingFace weight is downloaded. -output_path: path for storing the converted MindSpore weight file. -``` - -### Preparing a Dataset - -MindSpore Transformers provides **WikiText2** as the pretraining dataset and **alpaca** as the fine-tuning dataset. - -| Dataset | Applicable Model | Applicable Phase | Download Link | -|:----------|:-------------------------------------:|:---------:| :--------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| alpaca | Llama2-7B
    Llama2-13B
    Llama2-70B | Fine-tuning | [Link](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json) | - -The following uses the alpaca dataset as an example. After downloading the dataset, you need to preprocess it. For details about how to download the `tokenizer.model` used in preprocessing, see the model weight download. - -**alpaca Data Preprocessing** - -1. Run the [alpaca_converter.py script](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/llama/alpaca_converter.py) in MindSpore Transformers to convert the dataset into the multi-round dialog format. - - ```bash - python alpaca_converter.py \ - --data_path /{path}/alpaca_data.json \ - --output_path /{path}/alpaca-data-conversation.json - ``` - - Parameters: - - ```commandline - data_path: path of the file to be downloaded. - output_path: path for storing output files. - ``` - -2. Run the [llama_preprocess.py script](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/llama/llama_preprocess.py) in MindSpore Transformers to convert the data into the MindRecord format. This operation depends on the fastchat tool package to parse the prompt template. You need to install fastchat 0.2.13 or later in advance. - - ```bash - python llama_preprocess.py \ - --dataset_type qa \ - --input_glob /{path}/alpaca-data-conversation.json \ - --model_file /{path}/tokenizer.model \ - --seq_length 4096 \ - --output_file /{path}/alpaca-fastchat4096.mindrecord - ``` - - Parameters: - - ```commandline - dataset_type: type of the data to be preprocessed. - input_glob: path of the converted alpaca file. - model_file: path of the tokenizer.model file. - seq_length: sequence length of the output data. - output_file: path for storing output files. - ``` - -### Performing a Fine-tuning Task - -#### Single-Card Training - -Execute `run_mindformer.py` to start the fine-tuning task on a single card. Below is an example usage: - -Taking the fine-tuning of the Llama2 model on a single card as an example, due to the limited NPU memory, it is not possible to run the full Llama2-7B model, so we reduce the layers for the example. Modify `finetune_llama2_7b.yaml` and set `num_layers` to 2. - -The startup command is as follows: - -```shell -python run_mindformer.py \ - --config configs/llama2/finetune_llama2_7b.yaml \ - --train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord \ - --load_checkpoint /{path}/llama2_7b.ckpt \ - --use_parallel False \ - --run_mode finetune -``` - -#### Single-Node Training - -Take Llama2-7B as an example. Run the startup script **msrun** to perform 8-device distributed training. The startup command is as follows: - -```bash -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/finetune_llama2_7b.yaml \ - --load_checkpoint /{path}/llama2_7b.ckpt \ - --train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord \ - --use_parallel True \ - --run_mode finetune" 8 -``` - -Parameters: - -```commandline -config: model configuration file, which is stored in the config directory of the MindSpore Transformers code repository. -load_checkpoint: path of the checkpoint file. -train_dataset_dir: path of the training dataset. -use_parallel: specifies whether to enable parallelism. -run_mode: running mode. The value can be train, finetune, or predict (inference). -``` - -After the task is executed, the **checkpoint** folder is generated in the **mindformers/output** directory, and the model file is saved in this folder. 
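To locate the weights produced by the run above, the following small sketch lists the newest checkpoint file in each rank directory. It assumes the default `output/checkpoint/rank_*/` layout described in this document and is only an illustration, not part of MindSpore Transformers.

```python
from pathlib import Path

# Default output location used in this document; adjust if the run was started elsewhere.
checkpoint_root = Path("output/checkpoint")

# Each training rank writes its weights into its own rank_* sub-directory.
for rank_dir in sorted(checkpoint_root.glob("rank_*")):
    ckpt_files = sorted(rank_dir.glob("*.ckpt"), key=lambda p: p.stat().st_mtime)
    if ckpt_files:
        print(f"{rank_dir.name}: latest checkpoint is {ckpt_files[-1].name}")
    else:
        print(f"{rank_dir.name}: no checkpoint files yet")
```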
- -#### Multi-Node Training - -The multi-node multi-device fine-tuning task is similar to the pretraining task. You can refer to the [multi-node multi-device pretraining command](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/usage/pre_training.html#multi-node-training) and modify the command as follows: - -1. Add the input parameter `--load_checkpoint /{path}/llama2_7b.ckpt` to the startup script to load the pretrained weights. -2. Set `--train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord` in the startup script to load the fine-tuning dataset. -3. Set `--run_mode finetune` in the startup script. **run_mode** indicates the running mode, whose value can be **train**, **finetune**, or **predict** (inference). - -After the task is executed, the **checkpoint** folder is generated in the **mindformers/output** directory, and the model file is saved in this folder. - -## Using MindSpore Transformers for LoRA Low-Parameter Fine-Tuning - -MindSpore Transformers supports enabling LoRA fine-tuning through configuration, so no per-model code adaptation is required. To run a LoRA low-parameter fine-tuning task, take the YAML configuration file used for full-parameter fine-tuning, modify its model configuration, and add the `pet_config` low-parameter fine-tuning configuration. The following shows the model configuration section of the YAML file for LoRA fine-tuning of the Llama2 model, with a detailed description of the `pet_config` parameters. - -### YAML File Example - -For details about the complete YAML file, see [the Llama2 LoRA fine-tuning YAML file](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/lora_llama2_7b.yaml). - -```yaml -# model config -model: - model_config: - type: LlamaConfig - batch_size: 1 - seq_length: 4096 - hidden_size: 4096 - num_layers: 32 - num_heads: 32 - vocab_size: 32000 - compute_dtype: "float16" - pet_config: - pet_type: lora - lora_rank: 16 - lora_alpha: 16 - lora_dropout: 0.05 - target_modules: '.*wq|.*wk|.*wv|.*wo' - arch: - type: LlamaForCausalLM -``` - -### pet_config Parameters - -In **model_config**, **pet_config** is the core section for LoRA fine-tuning and specifies the LoRA parameters. The parameters are described as follows; a short numerical sketch of how `lora_rank` and `lora_alpha` are applied is shown after the list. - -- **pet_type**: specifies that the parameter-efficient tuning (PET) type is LoRA. LoRA modules are inserted into the key layers of the model to reduce the number of parameters that need to be fine-tuned. -- **lora_rank**: specifies the rank of the low-rank matrices. A smaller rank means fewer parameters to update during fine-tuning and lower compute and memory usage. The value **16** is a common balance point that significantly reduces the number of trainable parameters while maintaining model performance. -- **lora_alpha**: specifies the scaling factor for the weight update in the LoRA module. This value determines the magnitude of the update applied during fine-tuning. The value **16** gives a moderate scaling that helps stabilize training. -- **lora_dropout**: specifies the dropout probability in the LoRA module. Dropout is a regularization technique used to reduce the risk of overfitting. The value **0.05** means that each connection in the LoRA module is randomly disabled with a probability of 5% during training, which is especially important when the amount of data is limited. -- **target_modules**: specifies, by using a regular expression, the weight matrices to which LoRA is applied. For Llama2, this configuration applies LoRA to the Query (WQ), Key (WK), Value (WV), and Output (WO) matrices in the self-attention mechanism. These matrices play a key role in the Transformer structure, and inserting LoRA into them maintains model performance while reducing the number of trainable parameters.
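To make `lora_rank` and `lora_alpha` concrete, the following is a minimal, framework-agnostic sketch (plain NumPy, not MindSpore Transformers code) of how a LoRA update is applied to one frozen weight matrix. The dimensions mirror the YAML above, and the scaling factor applied to the low-rank update is `lora_alpha / lora_rank`.

```python
import numpy as np

hidden = 4096         # matches hidden_size in the YAML above
rank, alpha = 16, 16  # lora_rank and lora_alpha

# Frozen pretrained weight of one attention projection (e.g., wq), shape (out, in).
W = np.random.randn(hidden, hidden).astype(np.float32)
# Trainable low-rank factors; B starts at zero so the update is a no-op at step 0.
A = np.random.randn(rank, hidden).astype(np.float32)
B = np.zeros((hidden, rank), dtype=np.float32)

def lora_forward(x):
    # Equivalent to x @ (W + (alpha / rank) * B @ A).T, without forming the full update matrix.
    scaling = alpha / rank
    return x @ W.T + scaling * (x @ A.T) @ B.T

x = np.random.randn(2, hidden).astype(np.float32)
print(lora_forward(x).shape)                                        # (2, 4096)
print("trainable params:", A.size + B.size, "vs full matrix:", W.size)
```

Only `A` and `B` are trained, which is why the number of trainable parameters drops sharply while the frozen weight `W` stays untouched.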
- -### Examples of LoRA Fine-Tuning for Llama2-7B - -MindSpore Transformers provides [LoRA fine-tuning examples](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#lora%E5%BE%AE%E8%B0%83) for Llama2-7B. For details about the dataset used during fine-tuning, see [dataset downloading](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). - -Take Llama2-7B as an example. Run the following **msrun** startup command to perform 8-device distributed fine-tuning. - -```shell -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/lora_llama2_7b.yaml \ - --train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord \ - --load_checkpoint /{path}/llama2_7b.ckpt \ - --auto_trans_ckpt False \ - --use_parallel True \ - --run_mode finetune" 8 -``` - -If the distributed strategy of the weights does not match the distributed strategy of the model, the weights must be converted. In this case, set the checkpoint path to the parent directory of the `rank_0` folder and enable automatic weight conversion by setting `--auto_trans_ckpt True`. For a detailed description of the scenarios and usage of distributed weight conversion, refer to [Distributed Weight Slicing and Merging](https://www.mindspore.cn/mindformers/docs/en/r1.5.0/function/transform_weight.html). - -```shell -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/lora_llama2_7b.yaml \ - --train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord \ - --load_checkpoint /{path}/checkpoint/ \ - --auto_trans_ckpt True \ - --use_parallel True \ - --run_mode finetune" 8 -``` diff --git a/docs/mindformers/docs/source_zh_cn/_templates/classtemplate.rst b/docs/mindformers/docs/source_zh_cn/_templates/classtemplate.rst deleted file mode 100644 index 455b11c7a6c9c67b6fff3a943d9b60d85202c356..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/_templates/classtemplate.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. role:: hidden - :class: hidden-section - -.. currentmodule:: {{ module }} - -{% if objname in [] %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: construct - :members: - -{% elif objname[0].istitle() %} -{{ fullname | underline }} - -.. autoclass:: {{ name }} - :exclude-members: construct - :members: - -{% else %} -{{ fullname | underline }} - -.. autofunction:: {{ fullname }} - -{% endif %} - -.. 
- autogenerated from _templates/classtemplate.rst - note it does not have :inherited-members: diff --git a/docs/mindformers/docs/source_zh_cn/acc_optimize/acc_optimize.md b/docs/mindformers/docs/source_zh_cn/acc_optimize/acc_optimize.md deleted file mode 100644 index eabf7f6815e7d586dd5a858e8da1bbbd7f4d46b2..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/acc_optimize/acc_optimize.md +++ /dev/null @@ -1,492 +0,0 @@ -# 大模型精度调优指南 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/acc_optimize/acc_optimize.md) - -## 精度问题概述和场景 - -### 描述 - -随着昇腾AI处理器(以下简称为NPU)在深度学习中的广泛应用,基于昇腾NPU原生开发的MindSpore框架展现出了更好的性能优势。在大规模集群训练过程中,性能的提升将极大节省用户进行大模型开发的成本。因此,越来越多的用户逐渐将原本训练模型迁移至MindSpore中。然而,由于硬件以及框架使用上的差异,用户在完成模型迁移后可能会遇到精度问题。 - -本文总结了大模型训练过程中常见精度问题及通用的精度问题定位方法,力求帮助用户快速排查精度问题,缩短模型精度问题定位的时间。开始大模型精度调优工作时,应具备大模型的基础知识。为避免发散,本文档将不会解释大模型相关基础概念,聚焦精度调优介绍。 - -### 常见问题归类总结 - -大模型训练中经常出现各种精度问题,常见的问题包括loss无法收敛、loss收敛效果不佳、训练后期loss不收敛、精度溢出、loss下降过程中与标杆无法拟合等;造成这些精度问题可能有多种原因,包括模型结构、数据集、超参数、前反向计算精度、优化器计算、浮点计算精度、随机性等。 - -当出现精度问题时,可以从造成这些精度误差的原因进行问题分析。先根据CheckList快速排查,然后对齐参数和权重、固定随机性和开启确定性计算,接着排查基础问题,最后通过长稳训练排查异常Step的问题。在当前阶段,本文主要针对有精度标杆的场景,介绍精度定位的通用方法,后续将陆续添加无精度标杆下的精度问题定位内容。 - -## 精度问题定位CheckList - -在定位算子精度问题之前,首先要排除其他非算子因素的干扰。结合以往精度定位案例,总结了精度定位前的CheckList。为了在定位过程中少走弯路,用户可先根据CheckList进行快速的排查。 - -### 网络结构CheckList - -#### 通用结构 - -| **关键参数**                   | **说明** | **检查项** | -| ----------------- | ------------------------------------------------------------ |--------------------------------------------------------------------------| -| num_layers | transformer层数 | 对应Megatron num-layers参数,检查是否一致。 | -| num_heads | transformer中attention heads数量 | 对应Megatron num-attention-heads参数,检查是否一致。 | -| hidden_size | transformer隐藏层大小 | 对应Megatron hidden-size参数,检查是否一致。 | -| intermediate_size | Feed-Forward Network的隐藏层大小 | 对应Megatron中ffn-hidden-size参数,检查是否一致。 | -| n_kv_heads | kv分组数 | 对应Megatron中的num-query-groups,检查是否一致。 | -| 正则化函数 | 正则化函数,常见结构有LayerNorm、RMSNorm | MindSpore Transformers中使用指定的正则化函数,无法通过配置修改。Megatron中可通过normalization自定义配置,检查是否一致。 | -| rms_norm_eps | 正则化的epsilon参数 | 对应Megatron的layernorm_epsilon,检查是否一致。 | -| dropout | 网络中的dropout | 当前MindSpore开启dropout时,不能开重计算;若进行精度比对,建议两边都关闭,减少随机因素。 | -| 融合计算 | 常见的融合算子包括FA、ROPE、Norm、SwigLU;部分用户会将Wq、Wk、Wv进行融合计算 | 1. 同硬件下进行精度比对时,若有使用融合算子,需要保持一致。
    2. 不同硬件下进行精度比对时,重点检查融合计算部分是否有计算差异。 | - -#### MOE结构 - -| **关键参数**                                   | **说明** | **检查项** | -| ------------------------ |----------------------------------|---------------------------------------------------------------------------------------------------------------------| -| expert_num | 专家数量 | 对应Megatron的num-experts,检查是否一致。 | -| num_experts_chosen | 每个token选择的专家数目 | 对应Megatron的moe-router-topk,检查是否一致。 | -| capacity_factor | 专家容量系数 | 对应Megatron的moe_expert_capacity_factor参数,检查是否一致。 | -| aux_loss_factor | 负载均衡loss贡献因子 | 开启时,建议小于0.05。若进行精度对齐,不建议开启,否则会与Megatron的loss打印方式不一致。 | -| enable_sdrop | 是否开启sdrop(drop实现)方式 | 建议设置成true,对应Megatron需要设置如下参数:
    `moe-token-drop-policy: position`
    `moe-pad-expert-input-to-capacity: True` | -| router_dense_type | 决定专家的dense层 | MindSpore Transformers中可配置,建议使用FP32计算,防止溢出;Megatron中不可配置。 | -| use_fused_ops_topkrouter | 是否使用融合算子进行dispatch以及combine的索引计算 | MindSpore Transformers中融合算子只有在设置`enable_sdrop=True`时才生效,精度对齐建议设置成True。 | -| use_shared_expert_gating | 共享专家网络中是否使用gating系数 | 检查网络的共享专家是否使用gating系数,如果有,设置成True。 | - -### 优化器CheckList - -| **关键参数** | **说明** | **检查项** | -| ------------------ | ---------------------- | ------------------------------------------------------------ | -| adam优化器 | 优化器类型 | 若Megatron使用adam优化器,MindSpore Transformers的数学等价实现为AdamW。 | -| eps | adam优化器极小值参数 | 检查参数是否一致,推荐值1e-8。 | -| beta1 | adam优化器梯度动量参数 | 检查参数是否一致,推荐值0.9。 | -| beta2 | adam优化器梯度方差参数 | 检查参数是否一致,推荐值0.95。 | -| weight_decay | 权重衰减 | 默认情况下bias及一维权重不进行衰减,检查用户是否有特殊操作。 | -| lr | 学习率 | 在设置了warmup、学习率衰减后,画图查看学习率变化是否一致。 | -| lr_warmup_fraction | 学习率预热步数占比 | 在设置了warmup、学习率衰减后,画图查看学习率变化是否一致。 | -| clip_grad | 修剪梯度 | 检查参数是否一致,推荐值1.0。 | -| global_batch_size | 全局批大小 | 检查参数是否一致,可以通过训练过程中的打印日志检查。 | - -### 权重CheckList - -| **关键参数** | **说明** | **检查项** | -| --------------- | -------------------- | ------------------------------------------------------------ | -| param_init_type | 权重初始化类型 | MindSpore Transformers通常会设置param_init_dtype类型为FP32,这是因为梯度通信类型是跟权重类型一致,控制通信类型为FP32。而Megatron的梯度通信类型默认为FP32,不与权重类型绑定。 | -| init-method-std | 权重随机初始化的分布 | 若使用权重随机初始化,需要检查随机分布中的mean/std等参数是否一致。 | - -### 混合精度CheckList - -| **关键参数**                             | **说明** | **检查项** | -| ---------------------- |----------------------------------------------------|---------------------------------------------------------------------------------------------------------------------| -| compute_dtype | 计算精度 | Megatron 设置 `--bf16: true` 则为BF16,否则为FP16。 | -| layernorm_compute_type | LayerNorm/RMSNorm的计算精度 | Megatron不可配置,需要检查实现是否保持一致。 | -| softmax_compute_type | MindSpore使用FA时,内部Softmax固定用FA计算,仅在小算子拼接实现时可配置计算类型。 | Megatron不可配置,需要检查实现是否保持一致。 | -| rotary_dtype | 旋转位置编码的计算精度 | Megatron不可配置,需要检查实现是否保持一致。 | -| 各权重计算 | Embedding、lm_head等各权重精度计算 | 由于MindSpore Transformers权重初始化需要设置为FP32,而通常计算精度为BF16/FP16,需要确认权重计算前,是否将权重数据类型转为BF16/FP16。 | -| bias add | 线性层的bias | 线性层若有bias,检查add的计算精度是否一致。 | -| residual add | 残差相加 | 检查残差的计算精度是否与标杆一致。 | -| loss | loss计算模块 | 检查整个loss模块的计算精度是否与标杆一致。 | -| 算子高精度模式 | 昇腾算子支持高精度模式 | 开启方式: `context.set_context(ascend_config= {"ge_options":{ "global":{ "ge.opSelectImplmode":"high_precision" } } })` | - -### 并行策略CheckList - -| **关键参数** | **说明** | **检查项** | -| -------------------------- | ---------------------- | ------------------------------------------------------------ | -| data_parallel | 数据并行 | 并行切分会影响通信行为,切分后引入通信的计算跟单卡计算可能会有细微差异。 | -| model_parallel | 模型并行 | 并行切分会影响通信行为,切分后引入通信的计算跟单卡计算可能会有细微差异。 | -| pipeline_stage | 流水并行 | 并行切分会影响通信行为,切分后引入通信的计算跟单卡计算可能会有细微差异。 | -| use_seq_parallel | 对应Megatron短序列并行 | 并行切分会影响通信行为,切分后引入通信的计算跟单卡计算可能会有细微差异。 | -| enable_parallel_optimizer | 优化器并行 | 优化器并行MindSpore与PyTorch两个框架的实现方案不同,通信行为不一致。进行精度对齐时,建议关闭。 | -| micro_batch_interleave_num | 多副本并行 | 优化器并行MindSpore与PyTorch两个框架的实现方案不同,进行精度对齐时,建议关闭。 | - -### 其他CheckList - -| **关键点** | **检查项** | -| ------------- |----------------------------------------------------------------------------------------------| -| 数据检查 | 查看数据是否异常,可随机抽取部分数据进行decode、encode检查,查看input与label的位置是否正确对应。 | -| 特殊词检查 | 检查bos_token_id、eos_token_id、pad_token_id等特殊ids是否与数据制作时的ids保持一致。 | -| inputs_id校验 | 检查Embedding中的inputs_id是否符合0<=inputs_id 0,权重更新,进行长稳测试。训练至某个step出现loss差异较大的现象,之后训练loss开始发散,如图所示: - -![loss1](./image/loss1.png) - 
-在该场景下,可针对突变前后的训练进行排查,可尝试如下排查方式: - -* 检查loss突变附近的数据情况,排查是否有异常数据。通过tokenizer将数据decode为文字,查看数据是否异常;同时可尝试跳过这批数据进行训练,验证是否由数据导致。 - -* 检查在突变附近是否有精度溢出情况。 - -* 可以查看local norm是否有异常,检查Dump突变step的训练数据,排查计算的突变点,分析是否有算子异常输出。 - -#### loss后期差异较大 - -长稳测试中,还可能出现训练前期拟合较好,后期收敛loss出现较大差异,如图所示: - -![loss2](./image/loss2.png) - -在该场景下,可从如下角度进行排查: - -* 排查参数是否对齐:重点排查与优化器相关的参数,如优化器类型、learning rate、weight decay等。可通过画图对比训练过程中的learning rate变化是否一致,另外需要确认进行weight decay的权重是否与标杆一致。 - -* 混合精度排查:通过Dump工具,细致排查计算过程中混合精度是否与标杆一致。 - -* 若收敛时loss存在差异,但差异很小,如小于1%,可通过评测下游任务进行精度验收。 - -#### 场景扩展 - -在完成单卡对齐的情况下,逐步由单卡扩展为多卡测试、集群测试,模型规模、相关特性如模型并行以及流水并行、优化器并行等,视情况添加。由简单场景逐步扩展至实际训练的场景,从而排查新增的特性对精度的影响。 - -### 大模型迁移精度标准 - -大模型迁移精度标准是指,将其他第三方硬件或框架训练完成的模型,迁移至 MindSpore 和昇腾硬件后,为保证迁移前后模型精度基本持平,对关键指标设置的精度标准,该标准根据 MindSpore 大模型实际迁移场景总结形成,供开发者参考。由于大模型的精度与应用领域、模型结构、参数量、超参等强相关,且不具备完全的可解释性,目前没有形成完整统一的强制标准。因此,该标准仅作为参考标准,帮助用户对模型迁移精度做出基本的判断。 - -#### 精度标准规范 - -1. 相对误差统一按照百分比(x.x%)形式描述,绝对误差统一按照小数(0.xx)形式描述; -2. 如果第三方模型训练的精度波动情况已不符合该精度标准,应对原模型进行充分测试,并按照原模型波动情况放宽标准; - -#### 默认配置 - -| 类别 | 默认值 | 说明 | -|--------------------|------|-------------------------------| -| 数据集 | [pretrain] wikitext-103
    [sft] alpaca | | -| 精度模式 | BF16 | 混合精度配置保持一致,并注意区分网络中各API实际的 FP32/FP16/BF16 配置情况。 | -| 并行方式 | 数据并行 | 可根据计算资源调整并行方式。 | -| 集群规模 | 单机8卡 | 可根据计算资源调整。 | -| checkpoint | [pretrain] 脚本默认初始化
    [sft]加载预训练权重 | ckpt对精度指标影响较大,优先选择loss波动小,整体loss下降趋势明显的权重。| -|确定性|打开|确定精度指标阶段可以关闭确定性。比对阶段需打开确定性,以便减少随机误差干扰。| - -#### 精度标准指标 - -* 测试标准 - - 1. 无用户特殊指定下,默认连续观测5000个step或12个小时,可根据资源情况缩减step数,但不建议小于1000个step。 - 2. 加载相同的权重,保持所有超参配置一致,关闭所有随机性。 - 3. loss等指标的波动受模型、权重、超参的影响较大,优先选择loss波动平稳的组合作为标杆,减少随机波动对精度结果的判断。 - 4. 对第三方模型的随机性进行充分的测试,在关闭确定性的情况下,重复实验至少2次,观察精度指标的波动范围。 - -* loss 精度标准 - - 1. 首个loss绝对误差小于 0.005,或相对误差小于 0.5%。 - 2. 平均绝对误差小于 0.01,或平均相对误差小于 1%。 - -* 监控指标 - - global norm 平均相对误差不超过 10% 。 - -### 案例详解 - -本节将结合实际案例,介绍基于上述的精度定位流程完成精度排查。 - -#### 问题现象 - -在128卡集群下训练模型,使用 Ascend+MindSpore 训练与 GPU+PyTorch 训练进行对比,发现训练后期收敛的loss比 GPU+PyTorch 高0.1左右。如图所示,收敛不符合预期: - -![loss3](./image/loss3.png) - -红色线为 Ascend+MindSpore 训练曲线,蓝色线为 GPU+PyTorch 训练曲线。 - -#### 问题定位过程 - -在定位前,先对照CheckList进行检查,确认无误后启动问题的定位。 - -首先step1的loss对齐确认没问题。对比step1的local norm,计算每个权重的local norm值与标杆的差异,发现Embedding权重的local norm值与标杆的差异大。 - -![local norm](./image/local_norm.png) - -排查原因为MindSpore Transformers使用FP32进行权重初始化,前向计算及反向计算Embedding时均使用FP32精度计算;而PyTorch的前向及反向计算均为BF16,由此导致了计算出来的local norm值存在差异。 - -计算精度对齐后,排查优化器计算也没有问题,开始进行长稳训练对齐。 - -长稳训练排查将由单卡实验扩展到多卡实验,先设置learning rate=0,即权重不更新。前向计算每个step的loss差异在0.001左右,前向计算误差符合预期。反向计算每个step的global norm差异在0.05左右,反向计算差异不大;初步判断模型迁移代码正确,模型结构一致,前反向计算差异不大。 - -![loss4](./image/loss4.png) - -再权重更新,单卡训练,设置learning rate=1e-5,训练1千step。收敛后期loss有稳定的0.1的差异,复现问题。 - -![loss5](./image/loss5.png) - -进行问题排查。识别如下问题: - -* 通过Dump的文件排查,识别训练过程中存在计算精度不一致的地方,并将不一致的地方统一。 - -* Weight decay实现不一致,用户PyTorch网络所有权重均进行weight decay。MindSpore Transformers中bias权重及一维权重默认不进行weight decay。 - -修复问题后,再次进行实验,训练1万step,loss差异在0轴附近波动,且小于0.03, 精度符合预期,单卡精度对齐。 - -完成单卡训练后,启动多卡训练测试:设置learning rate=1e-5,训练1千step。训练后期收敛一致,但训练中期存在稳定的0.05误差。 - -![loss6](./image/loss6.png) - -为验证该误差在合理范围内,关闭确定性计算,重复跑两次GPU实验。图中红线为MindSpore训练的曲线,蓝色、绿色线分别是第一次、第二次GPU训练的曲线。在7千step左右训练不稳定处,MindSpore训练的曲线正处于两次GPU训练的曲线之间,说明误差处于合理范围内,问题最终解决。 - -![loss7](./image/loss7.png) diff --git a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/general_process.png b/docs/mindformers/docs/source_zh_cn/acc_optimize/image/general_process.png deleted file mode 100644 index 9b58f3a1af994c57b0ba6e5b8bf0a27801623c2a..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/general_process.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/local_norm.png b/docs/mindformers/docs/source_zh_cn/acc_optimize/image/local_norm.png deleted file mode 100644 index c648c187c6be5da9dc29c360f5c527fb0d40b644..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/local_norm.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss1.png b/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss1.png deleted file mode 100644 index c665b20eaf5ff0b40f0da7c6dd7724cc219e9491..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss1.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss2.png b/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss2.png deleted file mode 100644 index fef240e4e62ddb3b342877efd0c0c6e908462dff..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss2.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss3.png b/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss3.png deleted file mode 100644 
index 15cfd9315ec6ad44caf532e0901d71fb8dfc3c80..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss3.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss4.png b/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss4.png deleted file mode 100644 index 24fe8e8d01c7afa149d65eaab8eee89a7b600bc5..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss4.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss5.png b/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss5.png deleted file mode 100644 index 355cf5e1c247c8aff4938c7bc7756e318cc2ff2e..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss5.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss6.png b/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss6.png deleted file mode 100644 index c4061f5c18e886d1036001c0d509e0a3974b8684..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss6.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss7.png b/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss7.png deleted file mode 100644 index 4260277be9d8f46619b7e26531adee7c4f4138b4..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/acc_optimize/image/loss7.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/appendix/conf_files.md b/docs/mindformers/docs/source_zh_cn/appendix/conf_files.md deleted file mode 100644 index 22de2eaffdf333b949052862435a86b316192ad8..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/appendix/conf_files.md +++ /dev/null @@ -1,328 +0,0 @@ -# 配置文件说明 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/appendix/conf_files.md) - -## 概述 - -在模型的训练和推理过程中通常需要配置不同的参数,MindSpore Transformers支持使用`YAML`文件集中管理和调整可配置项,使模型的配置更加结构化,同时提高了其可维护性。 - -## YAML文件内容说明 - -MindSpore Transformers提供的`YAML`文件中包含对于不同功能的配置项,下面按照配置项的内容对其进行说明。 - -### 基础配置 - -基础配置主要用于指定MindSpore随机种子以及加载权重的相关设置。 - -| 参数 | 说明 | 类型 | -|-------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------| -| seed | 设置全局种子,详情可参考[mindspore.set_seed](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore/mindspore.set_seed.html)。 | int | -| run_mode | 设置模型的运行模式,可选`train`、`finetune`、`eval`或`predict`。 | str | -| output_dir | 设置保存log、checkpoint、strategy等文件的路径。 | str | -| load_checkpoint | 加载权重的文件或文件夹路径,目前有3个应用场景:
    1. 支持传入完整权重文件路径。
    2. 支持传入离线切分后的权重文件夹路径。
    3. 支持传入包含lora权重和base权重的文件夹路径。
    各种权重的获取途径可参考[权重转换功能](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/weight_conversion.html)。 | str | -| auto_trans_ckpt | 是否开启在线权重自动转换功能,详情可参考[权重转换功能](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/weight_conversion.html)。 | bool | -| resume_training | 是否开启断点续训功能,详情可参考[断点续训功能](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/resume_training.html#%E6%96%AD%E7%82%B9%E7%BB%AD%E8%AE%AD)。 | bool | -| load_ckpt_format | 加载的模型权重的格式,可选`ckpt`、`safetensors`。 | str | -| remove_redundancy | 加载的模型权重是否去除了冗余。默认值为`False`。 | bool | -| train_precision_sync | 训练确定性计算开关。默认值为`None` 。 | Optional[bool] | -| infer_precision_sync | 推理确定性计算开关。默认值为`None`。 | Optional[bool] | - -### Context配置 - -Context配置主要用于指定[mindspore.set_context](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore/mindspore.set_context.html)中的相关参数。 - -| 参数 | 说明 | 类型 | -| --------------------------- | ------------------------------------------------------------ | -------- | -| context.mode | 设置后端执行模式,`0`表示GRAPH_MODE,MindSpore Transformers目前仅支持在GRAPH_MODE模式下运行。 | int | -| context.device_target | 设置后端执行设备,MindSpore Transformers仅支持在`Ascend`设备上运行。 | str | -| context.device_id | 设置执行设备ID,其值必须在可用设备范围内,默认值为`0`。 | int | -| context.enable_graph_kernel | 是否开启图算融合去优化网络执行性能,默认值为`False`。 | bool | -| context.max_call_depth | 设置函数调用的最大深度,其值必须为正整数,默认值为`1000`。 | int | -| context.max_device_memory | 设置设备可用的最大内存,格式为"xxGB",默认值为`1024GB`。 | str | -| context.mempool_block_size | 设置内存块大小,格式为"xxGB",默认值为`1GB`。 | str | -| context.save_graphs | 在执行过程中保存编译图。
    1. `False`或`0`表示不保存中间编译图。
    2. `1`表示运行时会输出图编译过程中生成的一些中间文件。
    3. `True`或`2`表示生成更多后端流程相关的IR文件。
    4. `3`表示生成可视化计算图和更多详细的前端IR图。 | bool/int | -| context.save_graphs_path | 保存编译图的路径。 | str | -| context.affinity_cpu_list | 可选配置项,用于实现用户自定义绑核策略。不配置时,默认绑核。`None`表示关闭绑核。默认值为`{}`,如想使能自定义绑核策略,需传入`dict`,详情可参考[mindspore.runtime.set_cpu_affinity](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/runtime/mindspore.runtime.set_cpu_affinity.html#mindspore.runtime.set_cpu_affinity)。 | dict/str | - -### 模型配置 - -由于不同的模型配置会有差异,这里仅对MindSpore Transformers中模型的通用配置进行说明。 - -| 参数 | 说明 | 类型 | -|--------------------------------------------|--------------------------------------------------------------------------------------------------|------| -| model.arch.type | 设置模型类,构建模型时可以根据模型类对模型进行实例化。 | str | -| model.model_config.type | 设置模型配置类,模型配置类需要与模型类匹配使用,即模型配置类中应包含所有模型类使用的参数。 | str | -| model.model_config.num_layers | 设置模型层数,通常指模型Decoder Layer的层数。 | int | -| model.model_config.seq_length | 设置模型序列长度,该参数表示模型所支持的最大序列长度。 | int | -| model.model_config.hidden_size | 设置模型隐藏状态的维数。 | int | -| model.model_config.vocab_size | 设置模型词表大小。 | int | -| model.model_config.top_k | 设置推理时从概率最大的`top_k`个tokens中采样。 | int | -| model.model_config.top_p | 设置推理时从概率最大且概率累计不超过`top_p`的tokens中采样。 | int | -| model.model_config.use_past | 是否开启模型增量推理,开启后可使用Paged Attention提升推理性能,在模型训练时必须设置为`False`。 | bool | -| model.model_config.max_decode_length | 设置生成文本的最大长度,包括输入长度。 | int | -| model.model_config.max_length | 同`max_decode_length`,与`max_decode_length`同时设置时,`max_length`生效。 | int | -| model.model_config.max_new_tokens | 设置生成新文本的最大长度,不包括输入长度,与`max_length`同时设置时,`max_new_tokens`生效。 | int | -| model.model_config.min_length | 设置生成文本的最小长度,包括输入长度。 | int | -| model.model_config.min_new_tokens | 设置生成新文本的最小长度,不包括输入长度,与`min_length`同时设置时,`min_new_tokens`生效。 | int | -| model.model_config.repetition_penalty | 设置生成重复文本的惩罚系数,`repetition_penalty`不小于1,等于1时不对重复输出进行惩罚。 | int | -| model.model_config.block_size | 设置Paged Attention中block的大小,仅`use_past=True`时生效。 | int | -| model.model_config.num_blocks | 设置Paged Attention中block的总数,仅`use_past=True`时生效,应满足`batch_size×seq_length<=block_size×num_blocks`。 | int | -| model.model_config.return_dict_in_generate | 是否以字典形式返回`generate`接口的推理结果,默认为`False`。 | bool | -| model.model_config.output_scores | 是否以字典形式返回结果时,包含每次前向生成时的输入softmax前的分数,默认为`False`。 | bool | -| model.model_config.output_logits | 是否以字典形式返回结果时,包含每次前向生成时模型输出的logits,默认为`False`。 | bool | -| model.model_config.layers_per_stage | 设置开启pipeline stage时,每个stage分配到的transformer层数,默认为`None`,表示每个stage平均分配。设置的值为一个长度为pipeline stage数量的整数列表,第i位表示第i个stage被分配到的transformer层数。 | list | - -### MoE配置 - -除了上述模型的基本配置,MoE模型需要单独配置一些moe模块的超参,由于不同模型使用的参数会有不同,仅对通用配置进行说明: - -| 参数 | 说明 | 类型 | -|--------------------------------------------|--------------------------------------------------------------------------------------------------|------| -| moe_config.expert_num | 设置路由专家数量。 | int | -| moe_config.shared_expert_num | 设置共享专家数量。 | int | -| moe_config.moe_intermediate_size | 设置专家层中间维度大小。 | int | -| moe_config.capacity_factor | 设置专家容量因子。 | int | -| moe_config.num_experts_chosen | 设置每个token选择专家数目。 | int | -| moe_config.enable_sdrop | 设置是否使能token丢弃策略`sdrop`,由于MindSpore Transformers的MoE是静态shape实现所以不能保留所有token。 | bool | -| moe_config.aux_loss_factor | 设置均衡性loss的权重。 | list[float] | -| moe_config.first_k_dense_replace | 设置moe层的使能block,一般设置为1,表示第一个block不使能moe。 | int | -| moe_config.balance_via_topk_bias | 设置是否使能`aux_loss_free`负载均衡算法。 | bool | -| moe_config.topk_bias_update_rate | 设置`aux_loss_free`负载均衡算法`bias`更新步长。 | float | -| moe_config.comp_comm_parallel | 设置是否开启ffn的计算通信并行。默认值:False。 | bool | -| 
moe_config.comp_comm_parallel_degree | 设置ffn计算通信的分割数。数字越大,重叠越多,但会消耗更多内存。此参数仅在comp_com_parallel启用时有效。 | int | -| moe_config.moe_shared_expert_overlap | 设置是否开启共享专家和路由专家的计算通信并行。默认值:False。 | bool | - -### 模型训练配置 - -启动模型训练时,除了模型相关参数,还需要设置trainer、runner_config、学习率以及优化器等训练所需模块的参数,MindSpore Transformers提供了如下配置项。 - -| 参数 | 说明 | 类型 | -|---------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------| -| trainer.type | 设置trainer类,通常不同应用场景的模型会设置不同的trainer类。 | str | -| trainer.model_name | 设置模型名称,格式为'{name}_xxb',表示模型的某一规格。 | str | -| runner_config.epochs | 设置模型训练的轮数。 | int | -| runner_config.batch_size | 设置批处理数据的样本数,该配置会覆盖数据集配置中的`batch_size`。 | int | -| runner_config.sink_mode | 是否开启数据下沉模式。 | bool | -| runner_config.sink_size | 设置每次从Host下发到Device的迭代数量,仅`sink_mode=True`时生效,此参数将在后续版本中废弃。 | int | -| runner_config.gradient_accumulation_steps | 设置梯度累积步数,默认值为1,表示不开启梯度累积。 | int | -| runner_wrapper.type | 设置wrapper类,一般设置'MFTrainOneStepCell'即可。 | str | -| runner_wrapper.scale_sense.type | 设置梯度缩放类,一般设置'DynamicLossScaleUpdateCell'即可。 | str | -| runner_wrapper.scale_sense.use_clip_grad | 是否开启梯度剪裁,开启可避免反向梯度过大导致训练无法收敛的情况。 | bool | -| runner_wrapper.scale_sense.loss_scale_value | 设置loss动态尺度系数,模型loss可以根据该参数配置动态变化。 | int | -| lr_schedule.type | 设置lr_schedule类,lr_schedule主要用于调整模型训练中的学习率。 | str | -| lr_schedule.learning_rate | 设置初始化学习率大小。 | float | -| lr_scale | 是否开启学习率缩放。 | bool | -| lr_scale_factor | 设置学习率缩放系数。 | int | -| layer_scale | 是否开启层衰减。 | bool | -| layer_decay | 设置层衰减系数。 | float | -| optimizer.type | 设置优化器类,优化器主要用于计算模型训练的梯度。 | str | -| optimizer.weight_decay | 设置优化器权重衰减系数。 | float | -| train_dataset.batch_size | 同`runner_config.batch_size`。 | int | -| train_dataset.input_columns | 设置训练数据集输入的数据列。 | list | -| train_dataset.output_columns | 设置训练数据集输出的数据列。 | list | -| train_dataset.column_order | 设置训练数据集输出数据列的顺序。 | list | -| train_dataset.num_parallel_workers | 设置读取训练数据集的进程数。 | int | -| train_dataset.python_multiprocessing | 是否开启Python多进程模式提升数据处理性能。 | bool | -| train_dataset.drop_remainder | 是否在最后一个批处理数据包含样本数小于batch_size时,丢弃该批处理数据。 | bool | -| train_dataset.repeat | 设置数据集重复数据次数。 | int | -| train_dataset.numa_enable | 设置NUMA的默认状态为数据读取启动状态。 | bool | -| train_dataset.prefetch_size | 设置预读取数据量。 | int | -| train_dataset.data_loader.type | 设置数据加载类。 | str | -| train_dataset.data_loader.dataset_dir | 设置加载数据的路径。 | str | -| train_dataset.data_loader.shuffle | 是否在读取数据集时对数据进行随机排序。 | bool | -| train_dataset.transforms | 设置数据增强相关选项。 | - | -| train_dataset_task.type | 设置dataset类,该类用于对数据加载类以及其他相关配置进行封装。 | str | -| train_dataset_task.dataset_config | 通常设置为`train_dataset`的引用,包含`train_dataset`的所有配置项。 | - | -| auto_tune | 是否开启数据处理参数自动调优,详情可参考[set_enable_autotune](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/dataset/mindspore.dataset.config.set_enable_autotune.html)。 | bool | -| filepath_prefix | 设置数据优化后的参数配置的保存路径。 | str | -| autotune_per_step | 设置自动数据加速的配置调整step间隔,详情可参考[set_autotune_interval](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/dataset/mindspore.dataset.config.set_autotune_interval.html)。 | int | - -### 并行配置 - -为了提升模型的性能,在大规模集群的使用场景中通常需要为模型配置并行策略,详情可参考[分布式并行](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/distributed_parallel.html),MindSpore Transformers中的并行配置如下。 - -| 参数 | 说明 | 类型 | 
-|-----------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------| -| use_parallel | 是否开启并行模式。 | bool | -| parallel_config.data_parallel | 设置数据并行数。 | int | -| parallel_config.model_parallel | 设置模型并行数。 | int | -| parallel_config.context_parallel | 设置序列并行数。 | int | -| parallel_config.pipeline_stage | 设置流水线并行数。 | int | -| parallel_config.micro_batch_num | 设置流水线并行的微批次大小,在`parallel_config.pipeline_stage`大于1时,应满足`parallel_config.micro_batch_num` >= `parallel_config.pipeline_stage`。 | int | -| parallel_config.seq_split_num | 在序列流水线并行中设置序列分割数,该数应为序列长度的除数。 | int | -| parallel_config.gradient_aggregation_group | 设置梯度通信算子融合组的大小。 | int | -| parallel_config.context_parallel_algo | 设置长序列并行方案,可选`colossalai_cp`、`ulysses_cp`和`hybrid_cp`,仅在`context_parallel`切分数大于1时生效。 | str | -| parallel_config.ulysses_degree_in_cp | 设置Ulysses序列并行维度,与`hybrid_cp`长序列并行方案同步配置,需要确保`context_parallel`可以被该参数整除且大于1,同时确保`ulysses_degree_in_cp`可以被attention head数整除。 | int | -| micro_batch_interleave_num | 设置多副本并行数,大于1时开启多副本并行。通常在使用模型并行时开启,主要用于优化模型并行产生的通信损耗,仅使用流水并行时不建议开启。详情可参考[MicroBatchInterleaved](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/parallel/mindspore.parallel.nn.MicroBatchInterleaved.html)。 | int | -| parallel.parallel_mode | 设置并行模式,`0`表示数据并行模式, `1`表示半自动并行模式, `2`表示自动并行模式, `3`表示混合并行模式,一般设置为半自动并行模式。 | int | -| parallel.gradients_mean | 是否在梯度AllReduce后执行平均算子。通常半自动并行模式下设为`False`,数据并行模式下设为`True`。 | bool | -| parallel.enable_alltoall | 是否在通信期间生成AllToAll通信算子。通常仅在MOE场景下设为`True`,默认值为`False`。 | bool | -| parallel.full_batch | 是否在并行模式下从数据集中读取加载完整的批数据,设置为`True`表示所有rank都读取完整的批数据,设置为`False`表示每个rank仅加载对应的批数据,设置为`False`时必须设置对应的`dataset_strategy`。 | bool | -| parallel.dataset_strategy | 仅支持`List of List`类型且仅在`full_batch=False`时生效,列表中子列表的个数需要等于`train_dataset.input_columns`的长度,并且列表中的每个子列表需要和数据集返回的数据的shape保持一致。一般在数据的第1维进行数据并行切分,所以子列表的第1位数配置与`data_parallel`相同,其他位配置为`1`。具体原理可以参考[数据集切分](https://www.mindspore.cn/tutorials/zh-CN/master/parallel/dataset_slice.html)。 | list | -| parallel.search_mode | 设置全自动并行策略搜索模式,可选`recursive_programming`、`dynamic_programming`和`sharding_propagation`,仅在全自动并行模式下生效,实验性接口。 | str | -| parallel.strategy_ckpt_save_file | 设置并行切分策略文件的保存路径。 | str | -| parallel.strategy_ckpt_config.only_trainable_params | 是否仅保存(或加载)可训练参数的切分策略信息,默认为`True`,当网络中存在冻结的参数但又需要切分时将该参数设为`False`。 | bool | -| parallel.enable_parallel_optimizer | 是否开启优化器并行。
    1. 在数据并行模式下将模型权重参数按device数进行切分。
    2. 在半自动并行模式下将模型权重参数按`parallel_config.data_parallel`进行切分。 | bool | -| parallel.parallel_optimizer_config.gradient_accumulation_shard | 设置累计的梯度变量是否在数据并行的维度上进行切分,仅`enable_parallel_optimizer=True`时生效。 | bool | -| parallel.parallel_optimizer_config.parallel_optimizer_threshold | 设置优化器权重参数切分的阈值,仅`enable_parallel_optimizer=True`时生效。 | int | -| parallel.parallel_optimizer_config.optimizer_weight_shard_size | 设置优化器权重参数切分通信域的大小,要求该值可以整除`parallel_config.data_parallel`,仅`enable_parallel_optimizer=True`时生效。 | int | -| parallel.pipeline_config.pipeline_interleave | 使能interleave,使用Seq-Pipe流水线并行时需设置为`true`。 | bool | -| parallel.pipeline_config.pipeline_scheduler | Seq-Pipe的流水线调度策略,目前只支持`"seqpipe"`。 | str | - -> 配置并行策略时应满足device_num = data_parallel × model_parallel × context_parallel × pipeline_stage。 - -### 模型优化配置 - -1. MindSpore Transformers提供重计算相关配置,以降低模型在训练时的内存占用,详情可参考[重计算](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/perf_optimize/perf_optimize.html#重计算)。 - - | 参数 | 说明 | 类型 | - |----------------------------------------------------|-------------------------------|-----------------| - | recompute_config.recompute | 是否开启重计算。 | bool/list/tuple | - | recompute_config.select_recompute | 开启选择重计算,只针对attention层的算子进行重计算。 | bool/list | - | recompute_config.parallel_optimizer_comm_recompute | 是否对由优化器并行引入的AllGather通信进行重计算。 | bool/list | - | recompute_config.mp_comm_recompute | 是否对由模型并行引入的通信进行重计算。 | bool | - | recompute_config.recompute_slice_activation | 是否对保留在内存中的Cell输出切片。 | bool | - | recompute_config.select_recompute_exclude | 关闭指定算子的重计算,只对Primitive算子有效。 | bool/list | - | recompute_config.select_comm_recompute_exclude | 关闭指定算子的通讯重计算,只对Primitive算子有效。 | bool/list | - -2. MindSpore Transformers提供细粒度激活值SWAP相关配置,以降低模型在训练时的内存占用,详情可参考[细粒度激活值SWAP](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/fine_grained_activations_swap.html)。 - - | 参数 | 说明 | 类型 | - |------|-----|-----| - | swap_config.swap | 是否开启激活值SWAP。 | bool | - | swap_config.default_prefetch | 设置激活值卸载至host时的内存释放时机与开始取回device的时机,仅在开启激活值SWAP且未设置layer_swap与op_swap时生效。 | int | - | swap_config.layer_swap | 选择特定的层使能激活值SWAP。 | list | - | swap_config.op_swap | 选择特定层中的特定算子使能激活值SWAP。 | list | - -### Callbacks配置 - -MindSpore Transformers提供封装后的Callbacks函数类,主要实现在模型训练过程中返回模型的训练状态并输出、保存模型权重文件等一些操作,目前支持以下几个Callbacks函数类。 - -1. 
MFLossMonitor - - 该回调函数类主要用于在训练过程中对训练进度、模型Loss、学习率等信息进行打印,有如下几个可配置项: - - | 参数 | 说明 | 类型 | - |--------------------------------|-----------------------------------------------------------------------------------------|-------| - | learning_rate | 设置`MFLossMonitor`中初始化学习率,默认值为`None`。 | float | - | per_print_times | 设置`MFLossMonitor`中日志信息打印频率,默认值为`1`,即每一步打印一次日志信息。 | int | - | micro_batch_num | 设置训练中每一步的批数据大小,用于计算实际的loss值,若不配置该参数,则与[并行配置](#并行配置)中`parallel_config.micro_batch_num`一致。 | int | - | micro_batch_interleave_num | 设置训练中每一步的多副本批数据大小,用于计算实际的loss值,若不配置该参数,则与[并行配置](#并行配置)中`micro_batch_interleave_num`一致。 | int | - | origin_epochs | 设置`MFLossMonitor`中训练的轮数,若不配置该参数,则与[模型训练配置](#模型训练配置)中`runner_config.epochs`一致。 | int | - | dataset_size | 设置`MFLossMonitor`中初始化数据集大小,若不配置该参数,则与实际训练使用的数据集大小一致。 | int | - | initial_epoch | 设置`MFLossMonitor`中训练起始轮数,默认值为`0`。 | int | - | initial_step | 设置`MFLossMonitor`中训练起始步数,默认值为`0`。 | int | - | global_batch_size | 设置`MFLossMonitor`中全局批数据样本数,若不配置该参数,则会根据数据集大小以及并行策略自动计算。 | int | - | gradient_accumulation_steps | 设置`MFLossMonitor`中梯度累计步数,若不配置该参数,则与[模型训练配置](#模型训练配置)中`gradient_accumulation_steps`一致。 | int | - | check_for_nan_in_loss_and_grad | 设置是否在`MFLossMonitor`中开启溢出检测,开启后在模型训练过程中出现溢出则退出训练,默认值为`False`。 | bool | - -2. SummaryMonitor - - 该回调函数类主要用于收集Summary数据,详情可参考[mindspore.SummaryCollector](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore/mindspore.SummaryCollector.html)。 - -3. CheckpointMonitor - - 该回调函数类主要用于在模型训练过程中保存模型权重文件,有如下几个可配置项: - - | 参数 | 说明 | 类型 | - |-------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|------| - | prefix | 设置保存文件名称的前缀。 | str | - | directory | 设置保存文件名称的目录。 | str | - | save_checkpoint_seconds | 设置保存模型权重的间隔秒数。 | int | - | save_checkpoint_steps | 设置保存模型权重的间隔steps数。 | int | - | keep_checkpoint_max | 设置保存模型权重文件的最大数量,如果保存路径内存在超出数量的模型权重文件,会从创建时间最早的文件开始删除,以保证文件总数不超过`keep_checkpoint_max`。 | int | - | keep_checkpoint_per_n_minutes | 设置保存模型权重的间隔分钟数。 | int | - | integrated_save | 开启聚合保存权重文件。
    1. 设为True时表示在保存权重文件时聚合所有device的权重,即所有device权重一致。
    2. 设为False时表示所有device各自保存自己的权重。
    使用半自动并行模式时通常需要设置为False,以避免保存权重文件时出现内存问题。 | bool | - | save_network_params | 是否仅保存模型权重,默认值为`False`。 | bool | - | save_trainable_params | 是否额外保存可训练的参数权重,即部分微调时模型的参数权重,默认为`False`。 | bool | - | async_save | 是否异步执行保存模型权重文件。 | bool | - | remove_redundancy | 是否去除模型权重的冗余,默认值为`False`。 | bool | | | - | checkpoint_format | 保存的模型权重的格式,默认值为`ckpt`。可选`ckpt`,`safetensors`。 | str | - -在`callbacks`字段下可同时配置多个Callbacks函数类,以下是`callbacks`配置示例。 - -```yaml -callbacks: - - type: MFLossMonitor - - type: CheckpointMonitor - prefix: "name_xxb" - save_checkpoint_steps: 1000 - integrated_save: False - async_save: False -``` - -### Processor配置 - -Processor主要用于对输入模型的推理数据进行预处理,由于Processor配置项不固定,这里仅对MindSpore Transformers中的Processor通用配置项进行说明。 - -| 参数 | 说明 | 类型 | -|--------------------------------|--------------------------------------|-----| -| processor.type | 设置数据处理类。 | str | -| processor.return_tensors | 设置数据处理类返回的张量类型,一般使用'ms'。 | str | -| processor.image_processor.type | 设置图像数据处理类。 | str | -| processor.tokenizer.type | 设置文本tokenizer类。 | str | -| processor.tokenizer.vocab_file | 设置文本tokenizer读取文件路径,需要与tokenizer类相对应。 | str | - -### 模型评估配置 - -MindSpore Transformers提供模型评估功能,同时支持模型边训练边评估功能,以下是模型评估相关配置。 - -| 参数 | 说明 | 类型 | -|---------------------|------------------------------------------------------------|------| -| eval_dataset | 使用方式与`train_dataset`相同。 | - | -| eval_dataset_task | 使用方式与`eval_dataset_task`相同。 | - | -| metric.type | 使用方式与`callbacks`相同。 | - | -| do_eval | 是否开启边训练边评估功能 。 | bool | -| eval_step_interval | 设置评估的step间隔,默认值为100,设置小于0表示关闭根据step间隔评估功能。 | int | -| eval_epoch_interval | 设置评估的epoch间隔,默认值为-1,设置小于0表示关闭根据epoch间隔评估功能,不建议在数据下沉模式使用该配置。 | int | -| metric.type | 设置评估的类型。 | str | - -### Profile配置 - -MindSpore Transformers提供Profile作为模型性能调优的主要工具,详情可参考[性能调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/perf_optimize/perf_optimize.html)。以下是Profile相关配置。 - -| 参数 | 说明 | 类型 | -|-----------------------|-------------------------------------------------------------------------------------------------------------------------------------------|------| -| profile | 是否开启性能采集工具,默认值为`False`,详情可参考[mindspore.Profiler](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore/mindspore.Profiler.html)。 | bool | -| profile_start_step | 设置开始采集性能数据的step数,默认值为`1`。 | int | -| profile_stop_step | 设置停止采集性能数据的step数,默认值为`10`。 | int | -| profile_communication | 设置是否在多设备训练中收集通信性能数据,使用单卡训练时,该参数无效,默认值为`False`。 | bool | -| profile_memory | 设置是否收集Tensor内存数据,默认值为`True`。 | bool | -| profile_rank_ids | 设置开启性能采集的rank ids,默认值为`None`,表示所有rank id均开启性能采集。 | list | -| profile_pipeline | 设置是否按流水线并行每个stage的其中一张卡开启性能采集,默认值为`False`。 | bool | -| profile_output | 设置保存性能采集生成文件的文件夹路径。 | str | -| profile_level | 设置采集数据的级别,可选值为(0, 1, 2),默认值为`1`。 | int | -| with_stack | 设置是否收集Python侧的调用栈数据,默认值为`False`。 | bool | -| data_simplification | 设置是否开启数据精简,开启后将在导出性能采集数据后删除FRAMEWORK目录以及其他多余数据,默认为`False`。 | int | -| init_start_profile | 设置是否在Profiler初始化时开启采集性能数据,设置`profile_start_step`时该参数不生效,开启`profile_memory`时需要将该参数设为`True`。 | bool | -| mstx | 设置是否收集mstx时间戳记录,包括训练step、HCCL通信算子等,默认值为`False`。 | bool | - -### 指标监控配置 - -指标监控配置主要用于配置训练过程中各指标的记录方式,详情可参考[训练指标监控](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/monitor.html)。以下是MindSpore Transformers中通用的指标监控配置项说明: - -| 参数名称 | 说明 | 类型 | -|-----------------------------------------|----------------------------------------------------------------------------------------------------------------------------|---------------| -| monitor_config.monitor_on | 设置是否开启监控。默认为`False`,此时以下所有参数不生效。 | bool | -| 
monitor_config.dump_path | 设置训练过程中`local_norm`、`device_local_norm`、`local_loss`指标文件的保存路径。未设置或设置为`null`时取默认值'./dump'。 | str | -| monitor_config.target | 设置指标`优化器状态`和`local_norm`所监控的的目标参数的名称(片段),可为正则表达式。未设置或设置为`null`时取默认值['.*'],即指定所有参数。 | list[str] | -| monitor_config.invert | 设置反选`monitor_config.target`所指定的参数,默认为`False`。 | bool | -| monitor_config.step_interval | 设置记录指标的频率。默认为1,即每个step记录一次。 | int | -| monitor_config.local_loss_format | 设置指标`local_loss`的记录形式,可选值为字符串'tensorboard'和'log'(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或`null`。未设置时默认为`null`,表示不监控该指标。 | str或list[str] | -| monitor_config.local_norm_format | 设置指标`local_norm`的记录形式,可选值为字符串'tensorboard'和'log'(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或`null`。未设置时默认为`null`,表示不监控该指标。 | str或list[str] | -| monitor_config.device_local_norm_format | 设置指标`device_local_norm`的记录形式,可选值为字符串'tensorboard'和'log'(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或`null`。未设置时默认为`null`,表示不监控该指标。 | str或list[str] | -| monitor_config.optimizer_state_format | 设置指标`优化器状态`的记录形式,可选值为字符串'tensorboard'和'log'(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或`null`。未设置时默认为`null`,表示不监控该指标。 | str或list[str] | -| monitor_config.weight_state_format | 设置指标`权重L2-norm`的记录形式,可选值为字符串'tensorboard'和'log'(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或`null`。未设置时默认为`null`,表示不监控该指标。 | str或list[str] | -| monitor_config.throughput_baseline | 设置指标`吞吐量线性度`的基线值,需要为正数。未设置时默认为`null`,表示不监控该指标。 | int或float | -| monitor_config.print_struct | 设置是否打印模型的全部可训练参数名。若为`True`,则会在第一个step开始时打印所有可训练参数的名称,并在step结束后退出训练。默认为`False`。 | bool | - -### TensorBoard配置 - -TensorBoard配置主要用于配置训练过程中与TensorBoard相关的参数,便于在训练过程中实时查看和监控训练信息,详情可参考[训练指标监控](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/monitor.html)。以下是MindSpore Transformers中通用的TensorBoard配置项说明: - -| 参数名称 | 说明 | 类型 | -|-------------------------------------------|---------------------------------------------------------|------| -| tensorboard.tensorboard_dir | 设置 TensorBoard 事件文件的保存路径。 | str | -| tensorboard.tensorboard_queue_size | 设置采集队列的最大缓存值,超过该值便会写入事件文件,默认值为10。 | int | -| tensorboard.log_loss_scale_to_tensorboard | 设置是否将 loss scale 信息记录到事件文件,默认为`False`。 | bool | -| tensorboard.log_timers_to_tensorboard | 设置是否将计时器信息记录到事件文件,计时器信息包含当前训练步骤(或迭代)的时长以及吞吐量,默认为`False`。 | bool | \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md b/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md deleted file mode 100644 index 9059a8ff9a6408e0f5231476fd7d87e1fddf17d2..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md +++ /dev/null @@ -1,41 +0,0 @@ -# 环境变量说明 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/appendix/env_variables.md) - -以下是 MindSpore Transformers 支持的环境变量。 - -## 调试变量 - -| 变量名称 | 默认值 | 解释 | 说明 | 应用场景 | -| ------------------------------- | ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **HCCL_DETERMINISTIC** | false | 开启或关闭归约类通信算子的确定性计算,其中归约类通信算子包括 AllReduce、ReduceScatter、Reduce。 | `true`:打开 HCCL 确定性开关;
    `false`:关闭 HCCL 确定性开关。
    | 开启确定性计算可消除多卡计算顺序不一致引入的随机性,但也会导致性能相较关闭时下降。推荐在需要保持一致性场景时开启。 | -| **LCCL_DETERMINISTIC** | 0 | 设置 LCCL 确定性算子 AllReduce(保序加)是否开启。 | `1`:打开 LCCL 确定性开关;
    `0`:关闭 LCCL 确定性开关。 | 开启确定性计算可消除多卡计算顺序不一致引入的随机性,但也会导致性能相较关闭时下降。推荐在需要保持一致性场景时开启。
    在 rankSize<=8 时生效。 | -| **CUSTOM_MATMUL_SHUFFLE** | on | 开启或者关闭自定义矩阵乘法的洗牌操作。 | `on`:开启矩阵洗牌;
    `off`:关闭矩阵洗牌。 | 洗牌操作对于特定的矩阵尺寸和内存访问模式有优化效果,如果矩阵的大小与洗牌优化的尺寸不匹配,关闭洗牌可能会获得更好的性能。请根据实际使用进行设置。 | -| **ASCEND_LAUNCH_BLOCKING** | 0 | 训练或在线推理场景,可通过此环境变量控制算子执行时是否启动同步模式。 | `1`:强制算子采用同步模式运行;
    `0`:不强制算子采用同步模式运行。 | 由于 NPU 模型训练时默认算子异步执行,导致算子执行过程中出现报错时,打印的报错堆栈信息并不是实际的调用栈信息。当设置为`1`时,强制算子采用同步模式运行,这样能够打印正确的调用栈信息,从而更容易地调试和定位代码中的问题。设置为`1`时有更高的运算效率。 | -| **TE_PARALLEL_COMPILER** | 8 | 算子最大并行编译进程数,当大于 1 时开启并行编译。 | 取值为正整数;最大不超过 cpu 核数\*80%/昇腾 AI 处理器个数,取值范围 1~32,默认值是 8。 | 网络模型较大时,可通过配置此环境变量开启算子的并行编译功能;
    设置为`1`时为单线程编译,在调试时,可以简化难度。 | -| **CPU_AFFINITY** | 0 | 启动 CPU 亲和性开关,启动该选项可以确保每个进程或线程绑定到一个 CPU 核心上,以提高性能。 | `1`:开启 CPU 亲和性开关;
    `0`:关闭 CPU 亲和性开关。 | 出于**优化资源利用** 以及**节能** 的考虑,CPU 亲和性默认关闭。 | -| **MS_MEMORY_STATISTIC** | 0 | 内存统计。 | `1`:开启内存统计功能;
    `0`:关闭内存统计功能。 | 在内存分析时,可以统计内存的基本使用情况。具体可以参考[调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/perf_optimize/perf_optimize.html)。 | -| **MINDSPORE_DUMP_CONFIG** | | 指定 [云侧 Dump 功能](https://www.mindspore.cn/tutorials/zh-CN/r2.6.0/debug/dump.html) 或 [端侧 Dump 功能](https://www.mindspore.cn/lite/docs/zh-CN/r2.6.0/tools/benchmark_tool.html#dump功能) 所依赖的配置文件的路径 | 文件路径,支持相对路径与绝对路径。 | | -| **GLOG_v** | 3 | 控制 MindSpore 日志的级别。 | `0`:DEBUG;
    `1`:INFO;
    `2`:WARNING;
    `3`:ERROR:表示程序执行出现报错,输出错误日志,程序可能不会终止;
    `4`:CRITICAL,表示程序执行出现异常,将会终止执行程序。 | | -| **ASCEND_GLOBAL_LOG_LEVEL** | 3 | 控制 CANN 的日志级别。 | `0`:DEBUG;
    `1`:INFO;
    `2`:WARNING;
    `3`:ERROR;
    `4`:NULL,不输出日志。 | | -| **ASCEND_SLOG_PRINT_TO_STDOUT** | 0 | 是否开启日志打屏。开启后,日志将不会保存在 log 文件中,而是将产生的日志直接打屏显示。 | `1`:开启日志打屏;
    `0`:关闭日志打屏。 | | -| **ASCEND_GLOBAL_EVENT_ENABLE** | 0 | 设置应用类日志是否开启 Event 日志。 | `1`:开启 Event 日志;
    `0`:关闭 Event 日志。 | | -| **HCCL_EXEC_TIMEOUT** | 1836 | 通过该环境变量可控制设备间执行时同步等待的时间,在该配置时间内各设备进程等待其他设备执行通信同步。 | 取值范围为:(0, 17340],默认值为 1836,单位为 s。 | | -| **HCCL_CONNECT_TIMEOUT** | 120 | 分布式训练或推理场景下,用于限制不同设备之间 socket 建链过程的超时等待时间。 | 该环境变量需要配置为整数,取值范围[120,7200],默认值 120s。 | | -| **MS_NODE_ID** | NA | 动态组网启动场景下,指定本进程的rank_id。 | 本进程的rank_id,在集群内唯一。 | | - -## 其他变量 - -| 变量名称 | 默认值 | 解释 | 说明 | 应用场景 | -| ---------------------------------- |-------------|------------------------------------|------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------| -| **RUN_MODE** | predict | 设置运行模式。 | `predict`:推理;
    `finetune`:微调;
    `train`:训练;
    `eval`:评测。 | | -| **USE_ROPE_SELF_DEFINE** | true | 是否使用 ROPE 融合算子。 | `true`:使用 ROPE 融合算子;
    `false`:不使用 ROPE 融合算子。 | 默认开启 ROPE 融合算子可以提升计算效率。除调试场景,根据需要进行关闭,一般不作特别设置。 | -| **MS_ENABLE_INTERNAL_BOOST** | on | 是否打开 MindSpore 框架的内部加速功能。 | `on`:开启 MindSpore 内部加速;
    `off`:关闭 MindSpore 内部加速。 | 为了实现高性能推理,该配置默认开启。在进行调试或对比不同加速策略的情况下,需要关闭此参数以观察对性能的影响。 | -| **MS_GE_ATOMIC_CLEAN_POLICY** | 1 | 是否集中清理网络中 atomic 算子占用的内存。 | `0`:集中清理网络中所有 atomic 算子占用的内存;
    `1`:不集中清理内存,对网络中每一个 atomic 算子进行单独清零。 | 开关默认设置为`1`,方便用户对每个算子进行单独处理,可以进行算子内存复用等操作。设置为`0`后,集中清理算子所占内存。 | -| **ENABLE_LAZY_INLINE** | 1 | 是否开启 lazy inline。 | `0`:关闭 lazy inline;
    `1`:开启 lazy inline。 | 此特性在 mindspore≥2.2.0 下适用。通常在 pipeline 并行时使用以提高编译性能。默认开启,可配置关闭。 | -| **ENABLE_LAZY_INLINE_NO_PIPELINE** | 0 | 是否开启在非 pipeline 并行下的 lazy inline。 | `0`:关闭 lazy inline;
    `1`:开启 lazy inline。 | lazy inline 特性默认仅在 pipeline 并行模式下开启。如需在其他并行模式下使能 lazy inline,可将该环境变量设置为 1。 | -| **MS_ASCEND_CHECK_OVERFLOW_MODE** | INFNAN_MODE | 设置浮点计算结果输出模式。 | `SATURATION_MODE`:饱和模式,计算出现溢出时,饱和为浮点数极值(+-MAX);
    `INFNAN_MODE`:INF/NAN 模式,遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。 | 在大模型调优中,溢出状态对齐 PyTorch 方式,建议使用 INFNAN_MODE,即 export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE。
    遇到持续溢出问题时可尝试设置此变量为 INFNAN_MODE。 | -| **MF_LOG_SUFFIX** | NA | 设置所有 log 日志文件夹的自定义后缀。 | log 文件夹的后缀。默认值:无后缀 | 添加一致的后缀,可以隔离各个任务的日志,不会被覆写。 | -| **PLOG_REDIRECT_TO_OUTPUT** | False | 控制 plog 日志是否改变存储路径。 | `True`:存储到./output 目录下;
    `False`: 存储到默认存储位置。 | 设置之后方便用户查询 plog 日志。 | -| **MS_ENABLE_FA_FLATTEN** | on | 控制 是否支持 FlashAttention flatten 优化。 | `on`:启用 FlashAttention flatten 优化;
    `off`: 禁用 FlashAttention flatten 优化。 | 对于还未适配FlashAttention flatten 优化的模型提供回退机制。 | -| **EXPERIMENTAL_KERNEL_LAUNCH_GROUP** | NA | 控制是否支持算子批量并行下发,支持开启并行下发,并配置并行数 | `thread_num`: 并发线程数,一般不建议增加,默认值为`2`;
    `kernel_group_num`: 算子分组总数量,每线程`kernel_group_num/thread_num`个组,默认值为`8`。 | 该特性后续还会继续演进,后续行为可能会有变更,当前仅支持`deepseek`推理场景,有一定的性能优化,但是其他模型使用该特性可能会有劣化,用户需要谨慎使用,使用方法如下:`export EXPERIMENTAL_KERNEL_LAUNCH_GROUP="thread_num:2,kernel_group_num:8"`。 | diff --git a/docs/mindformers/docs/source_zh_cn/conf.py b/docs/mindformers/docs/source_zh_cn/conf.py deleted file mode 100644 index c21f2f0253c42f2534e0ba297e5b3d3d505b1071..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/conf.py +++ /dev/null @@ -1,323 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import glob -import os -import re -import shutil -import sys -from sphinx.ext import autodoc as sphinx_autodoc -import sphinx.ext.autosummary.generate as g - -sys.path.append(os.path.abspath('../_ext')) - -# Fix some dl-label lack class='simple' -from docutils.writers import _html_base - -with open(_html_base.__file__, "r", encoding="utf-8") as f: - code_str = f.read() - old_str = ''' if self.is_compactable(node): - classes.append('simple')''' - new_str = ''' if classes == []: - classes.append('simple')''' - code_str = code_str.replace(old_str, new_str) - exec(code_str, _html_base.__dict__) - -# -- Project information ----------------------------------------------------- - -project = 'MindSpore' -copyright = 'MindSpore' -author = 'MindSpore' - -# The full version, including alpha/beta/rc tags -release = 'master' - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -myst_enable_extensions = ["dollarmath", "amsmath"] - - -myst_heading_anchors = 5 -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', - 'myst_parser', - 'nbsphinx', - 'sphinx.ext.mathjax', - 'IPython.sphinxext.ipython_console_highlighting' -] - -source_suffix = { - '.rst': 'restructuredtext', - '.md': 'markdown', -} - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. 
-mathjax_path = 'https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/mathjax/MathJax-3.2.2/es5/tex-mml-chtml.js' - -mathjax_options = { - 'async':'async' -} - -nbsphinx_requirejs_path = 'https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js' - -nbsphinx_requirejs_options = { - "crossorigin": "anonymous", - "integrity": "sha256-1fEPhSsRKlFKGfK3eO710tEweHh1fwokU5wFGDHO+vg=" -} - -smartquotes_action = 'De' - -exclude_patterns = [] - -pygments_style = 'sphinx' - -autodoc_inherit_docstrings = False - -autosummary_generate = True - -autosummary_generate_overwrite = False - -html_search_language = 'zh' - -html_search_options = {'dict': '../../../resource/jieba.txt'} - -# -- Options for HTML output ------------------------------------------------- - -# Reconstruction of sphinx auto generated document translation. - -language = 'zh_CN' -locale_dirs = ['../../../../resource/locale/'] -gettext_compact = False - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'sphinx_rtd_theme' - -import sphinx_rtd_theme -layout_target = os.path.join(os.path.dirname(sphinx_rtd_theme.__file__), 'layout.html') -layout_src = '../../../../resource/_static/layout.html' -if os.path.exists(layout_target): - os.remove(layout_target) -shutil.copy(layout_src, layout_target) - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = { - 'python': ('https://docs.python.org/', '../../../../resource/python_objects.inv'), - 'numpy': ('https://docs.scipy.org/doc/numpy/', '../../../../resource/numpy_objects.inv'), -} - -# Modify regex for sphinx.ext.autosummary.generate.find_autosummary_in_lines. -gfile_abs_path = os.path.abspath(g.__file__) -autosummary_re_line_old = r"autosummary_re = re.compile(r'^(\s*)\.\.\s+autosummary::\s*')" -autosummary_re_line_new = r"autosummary_re = re.compile(r'^(\s*)\.\.\s+(ms[a-z]*)?autosummary::\s*')" -with open(gfile_abs_path, "r+", encoding="utf8") as f: - data = f.read() - data = data.replace(autosummary_re_line_old, autosummary_re_line_new) - exec(data, g.__dict__) - -# Modify default signatures for autodoc. 
-autodoc_source_path = os.path.abspath(sphinx_autodoc.__file__) -autodoc_source_re = re.compile(r'stringify_signature\(.*?\)') -get_param_func_str = r"""\ -import re -import inspect as inspect_ - -def get_param_func(func): - try: - source_code = inspect_.getsource(func) - if func.__doc__: - source_code = source_code.replace(func.__doc__, '') - all_params_str = re.findall(r"def [\w_\d\-]+\(([\S\s]*?)(\):|\) ->.*?:)", source_code) - if "@classmethod" in source_code: - all_params = re.sub("(self|cls)(,|, )?", '', all_params_str[0][0].replace("\n", "")) - else: - all_params = re.sub("(self)(,|, )?", '', all_params_str[0][0].replace("\n", "")) - return all_params - except: - return '' - -def get_obj(obj): - if isinstance(obj, type): - return obj.__init__ - - return obj -""" - -with open(autodoc_source_path, "r+", encoding="utf8") as f: - code_str = f.read() - code_str = autodoc_source_re.sub('"(" + get_param_func(get_obj(self.object)) + ")"', code_str, count=0) - exec(get_param_func_str, sphinx_autodoc.__dict__) - exec(code_str, sphinx_autodoc.__dict__) - -from sphinx import directives -with open('../_ext/overwriteobjectiondirective.txt', 'r', encoding="utf8") as f: - exec(f.read(), directives.__dict__) - -from sphinx.ext import viewcode -with open('../_ext/overwriteviewcode.txt', 'r', encoding="utf8") as f: - exec(f.read(), viewcode.__dict__) - -with open("../_ext/customdocumenter.txt", "r", encoding="utf8") as f: - code_str = f.read() - exec(code_str, sphinx_autodoc.__dict__) - -from myautosummary import MsCnAutoSummary - -def setup(app): - app.add_directive('mscnautosummary', MsCnAutoSummary) - app.add_config_value('rst_files', set(), False) - -# Copy source files of chinese python api from golden-stick repository. -from sphinx.util import logging -import shutil -logger = logging.getLogger(__name__) - -copy_path = 'docs/api/api_python' -src_dir_api = os.path.join(os.getenv("MFM_PATH"), copy_path) - -copy_list = [] -moment_dir=os.path.dirname(__file__) - -for i in os.listdir(src_dir_api): - if os.path.isfile(os.path.join(src_dir_api,i)): - if os.path.exists('./'+i): - os.remove('./'+i) - shutil.copy(os.path.join(src_dir_api,i),'./'+i) - copy_list.append(os.path.join(moment_dir,i)) - else: - if os.path.exists('./'+i): - shutil.rmtree('./'+i) - shutil.copytree(os.path.join(src_dir_api,i),'./'+i) - copy_list.append(os.path.join(moment_dir,i)) - -# Rename .rst file to .txt file for include directive. 
-from rename_include import rename_include - -rename_include('experimental') - -if os.path.exists('./mindformers.experimental.rst'): - os.remove('./mindformers.experimental.rst') - -if os.path.exists('./experimental'): - shutil.rmtree('./experimental') - -if os.path.exists('./usage/pretrain_gpt.md'): - os.remove('./usage/pretrain_gpt.md') - -with open('./index.rst', 'r+', encoding='utf-8') as f: - ind_content = f.read() - ind_content = re.sub('.*usage/pretrain_gpt.*\n', '', ind_content) - f.seek(0) - f.truncate() - f.write(ind_content) - -# add view -import json - -if os.path.exists('../../../../tools/generate_html/version.json'): - with open('../../../../tools/generate_html/version.json', 'r+', encoding='utf-8') as f: - version_inf = json.load(f) -elif os.path.exists('../../../../tools/generate_html/daily_dev.json'): - with open('../../../../tools/generate_html/daily_dev.json', 'r+', encoding='utf-8') as f: - version_inf = json.load(f) -elif os.path.exists('../../../../tools/generate_html/daily.json'): - with open('../../../../tools/generate_html/daily.json', 'r+', encoding='utf-8') as f: - version_inf = json.load(f) - -if os.getenv("MFM_PATH").split('/')[-1]: - copy_repo = os.getenv("MFM_PATH").split('/')[-1] -else: - copy_repo = os.getenv("MFM_PATH").split('/')[-2] - -branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if version_inf[i]['name'] == copy_repo.replace('-', '_')][0] -docs_branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if version_inf[i]['name'] == 'tutorials'][0] - -re_view = f"\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{docs_branch}/" + \ - f"resource/_static/logo_source.svg\n :target: https://gitee.com/mindspore/{copy_repo}/blob/{branch}/" - -re_url = r"(((gitee.com/mindspore/docs)|(github.com/mindspore-ai/(mindspore|docs))|" + \ - r"(mindspore.cn/(docs|tutorials|lite))|(obs.dualstack.cn-north-4.myhuaweicloud)|" + \ - r"(mindspore-website.obs.cn-north-4.myhuaweicloud))[\w\d/_.-]*?)/(master)" - -re_url2 = r"(gitee.com/mindspore/mindspore[\w\d/_.-]*?)/(master)" - -re_url3 = r"(((gitee.com/mindspore/golden-stick)|(mindspore.cn/golden_stick))[\w\d/_.-]*?)/(master)" - -re_url4 = r"(((gitee.com/mindspore/mindformers)|(mindspore.cn/mindformers))[\w\d/_.-]*?)/(dev)" - -for cur, _, files in os.walk(moment_dir): - for i in files: - flag_copy = 0 - if i.endswith('.rst'): - for j in copy_list: - if j in cur: - flag_copy = 1 - break - if os.path.join(cur, i) in copy_list or flag_copy: - try: - with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: - content = f.read() - new_content = content - if '.. include::' in content and '.. 
automodule::' in content: - continue - if 'autosummary::' not in content and "\n======" in content: - re_view_ = re_view + copy_path + cur.split(moment_dir)[-1] + '/' + i + \ - '\n :alt: 查看源文件\n\n' - new_content = re.sub('([=]{5,})\n', r'\1\n' + re_view_, content, 1) - new_content = re.sub(re_url, r'\1/r2.6.0', new_content) - new_content = re.sub(re_url2, r'\1/v2.6.0', new_content) - new_content = re.sub(re_url3, r'\1/r1.1.0', new_content) - new_content = re.sub(re_url4, r'\1/r1.5.0', new_content) - if new_content != content: - f.seek(0) - f.truncate() - f.write(new_content) - except Exception: - print(f'打开{i}文件失败') - - -sys.path.append(os.path.abspath('../../../../resource/sphinx_ext')) -# import anchor_mod -import nbsphinx_mod - -sys.path.append(os.path.abspath('../../../../resource/search')) -import search_code - -# src_release = os.path.join(os.getenv("MFM_PATH"), 'RELEASE_CN.md') -# des_release = "./RELEASE.md" -# with open(src_release, "r", encoding="utf-8") as f: -# data = f.read() -# if len(re.findall("\n## (.*?)\n",data)) > 1: -# content = re.findall("(## [\s\S\n]*?)\n## ", data) -# else: -# content = re.findall("(## [\s\S\n]*)", data) -# #result = content[0].replace('# MindSpore', '#', 1) -# with open(des_release, "w", encoding="utf-8") as p: -# p.write("# Release Notes"+"\n\n") -# p.write(content[0]) \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/faq/func_related.md b/docs/mindformers/docs/source_zh_cn/faq/func_related.md deleted file mode 100644 index 422ed0ec4418c727a9fc8bf3ec5e563a8e5f008f..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/faq/func_related.md +++ /dev/null @@ -1,21 +0,0 @@ -# 功能相关 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/faq/func_related.md) - -## Q: WikiText数据集下载链接失效。 - -A: 官方下载链接失效,请关注社区Issue [#IBV35D](https://gitee.com/mindspore/mindformers/issues/IBV35D)。 - -
    - -## Q: 如何生成模型切分策略文件? - -A: 模型切分策略文件记录了模型权重在分布式场景下的切分策略,一般在离线权重切分时使用。在网络`yaml`文件中配置`only_save_strategy: True`,然后正常启动分布式任务,便可在`output/strategy/`目录下生成分布式策略文件,详细介绍请参阅[分布式权重切分与合并教程](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/transform_weight.html#%E7%A6%BB%E7%BA%BF%E8%BD%AC%E6%8D%A2%E9%85%8D%E7%BD%AE%E8%AF%B4%E6%98%8E)。 - -
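下面给出一个开启策略文件保存的最小`yaml`配置示意(仅列出相关字段,路径为示例值,其余字段保持原配置即可):

```yaml
output_dir: './output'       # 分布式策略文件会生成在 ./output/strategy/ 目录下
only_save_strategy: True    # 仅生成并保存分布式策略文件
```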
    - -## Q: 生成`ranktable`文件报错`socket.gaierror: [Errno -2] Name or service not known`或者`socket.gaierror: [Errno -3] Temporary failure in name resolution`,怎么解决? - -A: 从`MindSpore Transformers r1.2.0`版本开始,集群启动统一使用`msrun`方式,`ranktable`启动方式已废弃。 - -
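作为参考,可直接使用仓库内的`msrun`启动脚本拉起分布式任务,无需再生成`ranktable`文件。下面是一个单机八卡启动的命令示意(其中YAML路径为示例值,请替换为实际使用的配置文件):

```shell
# 使用 msrun 方式启动单机 8 卡训练,末尾的 8 为使用的卡数
bash scripts/msrun_launcher.sh "run_mindformer.py \
 --config configs/llama2/pretrain_llama2_7b.yaml \
 --run_mode train \
 --use_parallel True" 8
```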
    \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/faq/mindformers_contribution.md b/docs/mindformers/docs/source_zh_cn/faq/mindformers_contribution.md deleted file mode 100644 index af8d16e8dfe74bf0b5266b638581bfa18ffec5b2..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/faq/mindformers_contribution.md +++ /dev/null @@ -1,155 +0,0 @@ -# MindSpore Transformers贡献指南 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/faq/mindformers_contribution.md) - -## 贡献代码至MindSpore Transformers - -### 代码风格要求 - -请遵循此风格,以便MindSpore Transformers审查、维护和开发。 - -- 编码指南 - - MindSpore Transformers社区使用`Python PEP 8` 编码风格。建议在IDE中安装以下插件,用于检查代码格式:`Lizard`、`ShellCheck` 和`PyLint`。 - -- 单元测试指南 - - MindSpore Transformers社区使用Python单元测试框架pytest。注释名称需反映测试用例的设计意图。 - -- 重构指南 - - 我们鼓励开发人员重构我们的代码,以消除代码坏味道。所有代码都要符合编码风格和测试风格,重构代码也不例外。无注释的代码行(nloc)的Lizard阈值为100,圈复杂度(cnc)的阈值为20。当收到Lizard警告时,必须重构要合并的代码。 - -- 文档指南 - - 我们使用MarkdownLint来检查Markdown文档格式。基于默认配置修改了以下规则 - - 1. MD007(无序列表缩进):参数indent设置为4,表示无序列表中的所有内容都需要缩进4个空格。 - 2. MD009(行尾空格):参数br_spaces设置为2,表示行尾可以有0或2个空格。 - 3. MD029(有序列表的序列号):参数style设置为ordered,表示升序。 - -### Fork-Pull 开发模型指导 - -- Fork MindSpore Transformers代码仓 - - 在提交代码至MindSpore Transformers项目之前,请确保已fork此项目到您自己的代码仓。MindSpore Transformers代码仓和您自己的代码仓之间可能会并行开发,请注意它们之间的一致性。 - -- 克隆远程代码仓 - - 如果您想将代码下载到本地计算机,最好使用git方法。 - - ```shell - # 在Gitee上克隆仓库 - git clone https://gitee.com/(insert_your_forked_repo)/mindformers.git - ``` - -- 本地开发代码 - - `dev`为开发分支,请从`dev`分支拉取最新代码进行开发。并在提交Pull Request时提交到`dev`分支。 - - ```shell - git checkout -b {新分支名称} origin/dev - ``` - -- 提交PR到MindSpore Transformers代码仓 - - 在最后一步中,您需要在新分支和`MindSpore Transformers`主分支之间拉取比较请求。完成拉取请求后,`Jenkins CI`将自动设置,进行构建测试。PR应该尽快合并到上游dev分支中,以降低合并的风险。 - - ```shell - # 添加所有更改到暂存区 - git add - - # 查看更新状态 - git status - - # 提交更改,使用-m选项添加commit标题 - git commit -m "你的commit标题" - - # 添加commit的具体描述,使用-s选项添加签名,-amend选项修改最近一次提交 - git commit -s --amend - - # 推送更改到远程仓库的新分支 - git push origin {新分支名称} - - ``` - -### 文件及代码格式 - -若希望将自定义模型合入`MindSpore Transformers`代码仓库,需要注意几点: - -1. 文件格式及位置要遵循规范。 -2. 将新模型在代码中进行注册,以适配高阶接口使用。 - -#### 文件格式及位置 - -1. 模型代码文件统一放置于`research/{model_name}`文件夹下,格式如下: - - ```plaintext - research/{model_name} - ├── {model_name} - | ├── {pretrain/finetune/predict}_{model_name}_{n}b.yaml - | ├── convert_weight.py # Torch权重转MindSpore权重脚本(迁移模型需提供) - | ├── convert_reversed.py # MindSpore权重转Torch权重脚本(迁移模型需提供) - | ├── run_{model_name}.py # 运行代码文件 - | ├── {model_name}.py # Model类代码文件 - | └── {model_name}_tokenizer.py # Tokenizer代码文件 - ``` - -2. 模型文档放置于同一`research/{model_name}`文件夹下。 - -## 提交PR的要求 - -### 只有一个commit - -对于多commit的PR,请使用`squash`命令将多个commit合并为一个。 -例如使用: - -```shell -git rebase -i HEAD~3 -``` - -可以看到: - -```shell -pick 1234567 添加新功能A -pick 89abcdef 修复了功能A中的bug -pick 01234567 对功能A进行了一些优化 -``` - -squash合并commit(可简化为 s, p, f 等简写) - -```shell -pick 1234567 添加新功能A -squash 89abcdef 修复了功能A中的bug -squash 01234567 对功能A进行了一些优化 -``` - -### PR描述 - -请使用以下md模板: - -```markdown - -### 相关的Issue - -### 原因(目的、解决的问题等) - -### 描述(做了什么,变更了什么) - -### check list - -#### 是否完成方案评审或问题根因分析(Y/N) - -#### 是否完成了功能模块的UT/ST,并执行通过,附上结果(Y/N) - -#### 是否涉及公共组件或对外接口修改,涉及时需给出修改范围和影响评估(Y/N) - -#### 是否涉及资料修改,涉及时需同步修改(Y/N) - -``` - -### 门禁要求 - -1. 提交PR需要[签署CLA](https://www.mindspore.cn/icla)。 - -2. 
提交PR需要通过CI门禁检查,门禁失败修改代码后,需要在评论下评论`/retest`手动重启门禁检查。 diff --git a/docs/mindformers/docs/source_zh_cn/faq/model_related.md b/docs/mindformers/docs/source_zh_cn/faq/model_related.md deleted file mode 100644 index fe5b23b9f9bda50d0e5014b5e409e33efe75aeca..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/faq/model_related.md +++ /dev/null @@ -1,17 +0,0 @@ -# 模型相关 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/faq/model_related.md) - -## Q: 网络运行时报错“Out of Memory”(`OOM`),如何处理? - -A: 首先上述报错指的是设备内存不足,导致这一报错的原因可能有多种,建议进行如下几方面的排查: - -1. 使用命令`npu-smi info`,确认卡是否独占状态。 -2. 建议运行网络时,使用对应网络默认`yaml`配置。 -3. 网络对应`yaml`配置文件中适当增大`max_device_memory`的值,注意需要给卡间通信预留部分内存,可以渐进性增大进行尝试。 -4. 调整混合并行策略,适当增大流水线并行(pp)和模型并行(mp),并相应减小数据并行(dp),保持`dp * mp * pp = device_num`,有必要时增加NPU数量。 -5. 尝试调小批次大小或序列长度。 -6. 打开选择重计算或完全重计算,打开优化器并行。 -7. 如问题仍需进一步排查,欢迎[提issue](https://gitee.com/mindspore/mindformers/issues)反馈。 - -
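针对上述第3、4点,下面给出一个`yaml`配置调整的参考示意(假设`max_device_memory`位于`context`配置项下,示例中的数值仅供参考,需结合实际环境逐步调整):

```yaml
context:
  max_device_memory: "58GB"   # 适当增大,同时为卡间通信预留部分内存

parallel_config:
  data_parallel: 2            # 适当减小 dp
  model_parallel: 2           # 适当增大 mp
  pipeline_stage: 2           # 适当增大 pp,保持 dp * mp * pp = device_num(此处以 8 卡为例)
```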
    \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/faq/modelers_contribution.md b/docs/mindformers/docs/source_zh_cn/faq/modelers_contribution.md deleted file mode 100644 index be3e5f69e0d00e8a57ce1393858d704e23dd1340..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/faq/modelers_contribution.md +++ /dev/null @@ -1,103 +0,0 @@ -# 魔乐社区贡献指南 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/faq/modelers_contribution.md) - -## 上传模型至魔乐社区 - -魔乐社区是一个模型托管平台,用户可以将自定义模型上传至[魔乐社区](https://modelers.cn/)进行托管。 - -### MindSpore Transformers内置模型 - -若用户的自定义模型使用了MindSpore Transformers提供的内置模型,即模型代码位于mindformers/models下的模型,且对模型结构代码未进行任何修改,则只需上传模型的权重文件和配置即可。 - -如,用户使用MindSpore Transformers的内置ChatGLM2模型,进行了微调训练,想分享微调后的模型权重,那么上传模型配置和权重文件即可。 - -下面是保存模型配置和权重的示例代码: - -```python -import mindspore as ms -from mindformers import ChatGLM2Config, ChatGLM2ForConditionalGeneration - -config = ChatGLM2Config() -model = ChatGLM2ForConditionalGeneration(config) -ms.load_checkpoint("path/model.ckpt", model) # 加载自定义权重 - -model.save_pretrained("./my_model", save_json=True) -``` - -上述代码运行后会保存config.json文件和mindspore_model.ckpt文件(较大权重会自动拆分保存)。 - -保存后可使用openmind_hub库,进行模型上传,可参考[模型上传](https://modelers.cn/docs/zh/best-practices/community_contribution/model_contribution.html#%E4%BD%BF%E7%94%A8openmind-hub-client%E4%B8%8A%E4%BC%A0%E6%A8%A1%E5%9E%8B)。 - -```python -import openmind_hub - -openmind_hub.upload_folder( - folder_path="/path/to/local/folder", - repo_id="username/your-model-name", - token="your-token", -) -``` - -已上传的例子可参考魔乐社区的[OpenLlama模型](https://modelers.cn/models/MindSpore-Lab/llama_7b/tree/main)。 - -### 自定义模型 - -若用户有自定义的模型代码,则需要同时上传模型代码文件,并在json配置文件中添加映射,使其可以通过Auto类导入。 - -#### 命名规则 - -上传到社区的自定义代码文件,一般有统一的命名规则。假设自定义模型名为model,其代码命名应当如下: - -```text ----- model - |- configuration_model.py # Config类代码文件 - |- modeling_model.py # Model类代码文件 - |- tokenization_model.py # Tokenizer代码文件 -``` - -#### 添加auto映射 - -为让Auto类使用时,能够顺利找到用户自定义的模型类,需要在config.json文件中,添加auto映射。添加内容如下: - -```json -{ - "auto_map": { - "AutoConfig": "configuration_model.MyConfig", - "AutoModel": "modeling_model.MyModel", - "AutoModelForCausalLM": "modeling_model.MyModelForCausalLM", - }, -} -``` - -若有自定义tokenizer,则需要保存tokenizer: - -```python -tokenizer.save_pretrained("./my_model", save_json=True) -``` - -并在保存的tokenizer_config.json中添加auto映射: - -```json -{ - "auto_map": { - "AutoTokenizer": ["tokenization_model.MyTokenizer", "tokenization_model.MyFastTokenizer"] - }, -} -``` - -#### 上传模型 - -可使用openmind_hub库,进行模型上传,可参考[模型上传](https://modelers.cn/docs/zh/best-practices/community_contribution/model_contribution.html#%E4%BD%BF%E7%94%A8openmind-hub-client%E4%B8%8A%E4%BC%A0%E6%A8%A1%E5%9E%8B)。 - -```python -import openmind_hub - -openmind_hub.upload_folder( - folder_path="/path/to/local/folder", - repo_id="username/your-model-name", - token="your-token", -) -``` - -已上传的例子可参考魔乐社区的[书生2模型](https://modelers.cn/models/MindSpore-Lab/internlm2-7b/tree/main)。 diff --git a/docs/mindformers/docs/source_zh_cn/full-process_1.png b/docs/mindformers/docs/source_zh_cn/full-process_1.png deleted file mode 100644 index dbb6a24333a105f779396fc342b049c72938e5c8..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/full-process_1.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/full-process_2.png 
b/docs/mindformers/docs/source_zh_cn/full-process_2.png deleted file mode 100644 index 27e14e5bb14815a03be6ab6fffe290dd995a8c5f..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/full-process_2.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/full-process_3.png b/docs/mindformers/docs/source_zh_cn/full-process_3.png deleted file mode 100644 index f422ae1f15ee0285eb9d37da52f096835bd98f93..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/full-process_3.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/full-process_4.png b/docs/mindformers/docs/source_zh_cn/full-process_4.png deleted file mode 100644 index d438149f0718f823a8da83b7fec5f679281b2b8c..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/full-process_4.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/full-process_5.png b/docs/mindformers/docs/source_zh_cn/full-process_5.png deleted file mode 100644 index 4356392871de33da27839693b25238e103097f64..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/full-process_5.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/function/dataset.md b/docs/mindformers/docs/source_zh_cn/function/dataset.md deleted file mode 100644 index 867a6cf4945ed05239fe021facb8a62e05e315e8..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/function/dataset.md +++ /dev/null @@ -1,783 +0,0 @@ -# 数据集 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/function/dataset.md) - -目前MindSpore Transformers的预训练和微调支持多种格式的数据集加载能力,包括Megatron多源数据集、MindRecord数据集以及HuggingFace数据集的加载方式。每种格式的数据集的具体使用方法的参考如下。 - -## Megatron多源数据集 - -Megatron多源数据集是指从多个不同来源收集的数据集,这些数据集可以包含不同的文本类型、格式和领域。使用多源数据集可以帮助模型学习到更广泛的语言特征和知识,从而提高模型的泛化能力和性能。Megatron框架目前实现的多源数据集,需要先将原数据集预处理成BIN格式的数据集。当前MindSpore Transformers已经原生适配了Megatron多源数据集,提供了制作BIN格式数据集的脚本,支持在训练任务中直接使用Megatron多源数据集。 - -### 制作 BIN 格式数据集 - -MindSpore Transformers 提供了一个预处理脚本 [mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py) 将文本数据转换成BIN格式数据集,该脚本当前仅支持处理特定形式的 JSON 格式的文件。用户需要先将原始数据集文件转换成特定形式的JSON格式的文件,再使用预处理脚本生成BIN格式的数据集文件。当前 MindSpore Transformers 中的一些模型已经提供了将特定开源数据集转换成特定形式 JSON 格式文件的脚本,用户如想使用自有数据集,则需要通过自行编写脚本的方式将其转换为所需形式。 - -所需的 JSON 格式文件内容的形式如下: - -```json -{"id": "0", "text": "The quick brown fox", "type": "Eng", "src": "www.nvidia.com", "title": "First Part"} -{"id": "1", "text": "jumps over the lazy dog", "type": "Eng", "src": "The Internet", "title": "Second Part"} -... -``` - -其中每条数据由若干键值对组成,支持的键及说明如下: - -- `"id"`: 数据的编号,按顺序编号即可,必须存在 -- `"text"`: 实际用作训练的文本数据,必须存在 -- `"type"`: 注明语言类型,可选 -- `"src"`:注明数据的来源,可选 -- `"title"`:注明数据的标题,可选 - -下面以处理 Wiki 数据集并用作 Llama2 模型预训练为例,说明制作 BIN 格式数据集的详细步骤: - -1. 下载 Wiki 数据集 - - 原始 Wiki 数据集的下载参考 [Llama2 数据集下载](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#%E6%95%B0%E6%8D%AE%E5%8F%8A%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87)。 - -2. 
生成 JSON 格式文件 - - Wiki 数据集的原始格式如下: - - ![](image/wikitext_sample.png) - - 将 Wiki 数据集处理后的 JSON 文件 `wiki.json` 的格式如下(省略长文本): - - ```json - {"id": 0, "text": "The gold dollar or gold one ..."} - {"id": 1, "text": "Super Mario Land is a 1989 ..."} - {"id": 2, "text": "The Sinclair Scientific Programmable ..."} - ... - ``` - -3. 下载 Llama2 的词表文件 - - 预处理脚本中会把原始文本数据使用模型的分词器 Tokenizer 处理成 Tokens 的形式,因此需要提前下载词表文件。 - - Llama2 词表文件的下载链接:[tokenizer.model](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/tokenizer.model) - -4. 使用预处理脚本生成 BIN 格式文件 - - 处理成上述这样特定的 JSON 格式的文件后,调用 [mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py) 将其转换成BIN格式的数据集,具体命令如下: - - ```shell - python mindformers/tools/dataset_preprocess/preprocess_indexed_dataset.py \ - --input ./wiki.json \ - --output-prefix wiki_processed_1024 \ - --tokenizer-type LlamaTokenizer \ - --vocab-file ./tokenizer.model \ - --add_bos_token True \ - --add_eos_token True \ - --pad_or_stitch stitch \ - --seq-length 1024 \ - --workers 1 - ``` - - 配置参数说明: - - - `--input`: JSON 格式文件的路径 - - `--output-prefix`: 预处理后的输出文件的文件名前缀 - - `--tokenizer-type`: 模型对应的 tokenizer 的类型 - - `--vocab-file`: 模型分词器 Tokenizer 的词表文件路径 - - `--add_bos_token`: 是否在数据的首位置添加 bos_token,默认为 False - - `--add_eos_token`: 是否在数据的末位置添加 eos_token,默认为 False - - `--pad_or_stitch`: 根据训练任务的要求,设置是否拼接还是补齐,pad为补齐模式,该模式会将长度不足的数据补齐至seq-length长度;stitch为拼接模式,该模式会将多条数据拼接成长度为seq-length的数据 - - `--seq-length`: 预处理后每条数据长度 - - `--workers`: 预处理时并行 worker 的数量 - -执行以上命令之后,会得到两个文件,分别为 `.bin` 和 `.idx` 格式的文件,其中 `.bin` 格式文件存储实际的数据,`.idx` 格式文件存储每条数据的索引。 - -### 在训练任务中使用多源数据集 - -按照如下方式在训练任务中使用Megatron多源数据集: - -1. 准备`parallel_speed_up.json`文件 - - `parallel_speed_up.json` 是数据集并行通信配置文件,文件内容如下: - - ```json - { - "dataset_broadcast_opt_level": 3 - } - ``` - -2. 设置环境变量 - - 在命令行输入如下命令设置环境变量: - - ```shell - export MS_DEV_DYNAMIC_SINK1=False - ``` - -3. 
修改训练任务的 YAML 配置文件 - - 在 YAML 配置文件中配置Megatron多源数据集的相关参数。此处,以 Llama2-7B 模型预训练任务来举例说明,修改 [pretrain_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/pretrain_llama2_7b.yaml#L39) 中的 `train_dataset` 、 `runner_config` 、 `parallel_config` 、 `parallel` 以及 `context` 配置项。具体修改及说明如下: - - ```yaml - train_dataset: &train_dataset - data_loader: - type: BlendedMegatronDatasetDataLoader - datasets_type: "GPTDataset" - sizes: - - 1000 - - 0 - - 0 - shuffle: False - config: - seed: 1234 - seq_length: 1024 - split: "1, 0, 0" - data_path: - - 0.3 - - "/path/to/my_wiki_test_1024_text_document" - - 0.7 - - "/path/to/my_wiki_test_1024_text_document" - num_dataset_builder_threads: 1 - eod_mask_loss: False - create_attention_mask: False - input_columns: ["input_ids", "labels", "loss_mask", "position_ids"] - ``` - - 其中: - - - data_loader.type:dataloader 的类型,需设置为 `BlendedMegatronDatasetDataLoader` 。 - - data_loader.datasets_type:数据集类型,当前仅支持 `GPTDataset` 。 - - data_loader.sizes:`- 1000` , `- 0` , `- 0` 分别为训练集、测试集以及验证集采样的大小,当前只支持配置训练集。 - - input_columns:设置训练数据集输入的数据列,一般配置为 `["input_ids", "labels", "loss_mask", "position_ids"]` 。 - - data_loader.config.seed: 创建数据集时的随机数种子,默认值: `1234` 。 - - data_loader.config.seq_length:每条数据的长度,必须和 YAML 配置中的 model.model_config.seq_length 保持一致。 - - data_loader.config.split:分割字符串,用逗号分隔训练集、测试集以及验证集的比重,用于从单个分布中绘制样本时分割数据集,当前只支持配置为 `"1, 0, 0"` 。 - - data_loader.config.data_path:数字是每个数据集的比重,字符串是数据集 BIN 文件的路径,路径需要去掉文件格式后缀 `.bin` 。 - - data_loader.config.num_dataset_builder_threads:创建数据集时使用的进程数,默认值: `1` 。 - - data_loader.config.eod_mask_loss:是否使用 eod mask 的开关,默认值: `False` 。 - - data_loader.config.create_attention_mask:是否构造 attention_mask,默认值:`True` 。 - - 当前多源数据集目前还存在限制,仅支持非 full batch 的场景,且不支持序列流水线并行特性,需要根据以下对相应配置项进行修改: - - ```yaml - runner_config: - sink_mode: True - sink_size: 1 - - parallel_config: - data_parallel: &dp 2 - model_parallel: 2 - pipeline_stage: 1 - - parallel: - full_batch: False - dataset_strategy: [[*dp, 1], [*dp, 1], [*dp, 1], [*dp, 1]] - - context: - ascend_config: - parallel_speed_up_json_path: "/path/to/parallel_speed_up.json" - ``` - - 需要注意的配置说明如下: - - - parallel.dataset_strategy:仅支持 List of List 类型,List中子List的个数需要等于 train_dataset.input_columns 的长度,并且 List 中的每个子 List 需要和数据集返回的数据的shape保持一致。一般在数据的第1维进行数据并行切分,所以子List的第1位数配置成 `*dp` ,其他位配置为 `1` 。具体原理可以参考[数据集切分](https://www.mindspore.cn/tutorials/zh-CN/r2.6.0/parallel/dataset_slice.html)。 - -4. 编译 Megatron 数据集模块 - - MindSpore Transformers 内置了 Megatron 的数据集模块代码,需要在启动训练任务之前执行如下命令进行编译: - - ```shell - pip install pybind11 - cd mindformers/dataset/blended_datasets - make - ``` - -## MindRecord 数据集 - -MindRecord 是由 MindSpore 开发的一种高效数据格式,用于存储机器学习或深度学习的数据集。 - -MindRecord 格式旨在提高数据处理效率,尤其是在大规模数据训练场景下,可以更快地加载和处理数据。 -MindRecord 文件通常包含了模型训练所需的输入样本,这些样本经过预处理(如编码、归一化等),以优化读取速度和内存使用。 - -更多关于 MindRecord 相关接口的实现及案例,请参考 [MindSpore 中关于 《MindRecord》 的相关文档](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore.mindrecord.html) - -### 如何制作 MindRecord 数据集 - -MindRecord 模块提供了一些方法帮助用户将不同数据集转换为 MindRecord 格式, -用户可以使用由 MindSpore 提供的 FileWriter 接口生成 MindRecord 格式数据集。 - -下面将以 Llama2 为例,提供一个基于 json 格式文件制作 MindRecord 数据集的案例: - -1. 准备 json 文件; - - 准备类似这样的 json 文件,命名为 `mydata.json` : - - ```json - [ - { - "text": "I love Beijing, because it is a city that beautifully blends rich history with modern vibrancy." - }, - { - "text": "I love Hangzhou, because it is a city that seamlessly combines natural beauty with rich cultural heritage." - } - ] - ``` - -2. 
读取 json 文件; - - ```python - import json - - raw_data = None - file = open("mydata.json", "r") # 打开 json 文件 - if file is not None: - raw_data = json.load(file) # 读取 json 文件到 raw_data 中 - file.close() - ``` - -3. 定义一个 MindRecord 的 ``schema`` ,并创建一个 ``FileWriter`` 对象; - - ```python - from mindspore.mindrecord import FileWriter - - # 定义一个 MindRecord 的 schema - schema = {'input_ids': {"type": "int32", "shape": [-1]}} - # 创建一个 FileWriter 对象 - writer = FileWriter(file_name="output_file", shard_num=1) - writer.add_schema(schema, "dataset_type") - ``` - -4. 遍历处理 json 文件中的每一条数据,将其转换为 MindRecord 格式,并写入 MindRecord 文件中。 - - 词表下载链接:[tokenizer.model](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/tokenizer.model) - - ```python - import numpy as np - from mindformers import LlamaTokenizer - - def tokenize_json(tokenizer, raw_data): - """tokenize json file dataset""" - content = [] # 读取每个 json 数据,获取其 "input_ids" - for line in raw_data: - stripped_line = line['text'].strip() - if stripped_line: - line_ids = tokenizer(stripped_line)["input_ids"] - content.append(line_ids) - - for ids in content: - sample = {} - sample['input_ids'] = np.array(ids, dtype=np.int32) - yield sample - - # 将文本数据分词 - word_tokenizer = LlamaTokenizer(vocab_file=r"tokenizer.model") - - # 遍历处理 json 文件中的每一条数据,将其转化为 MindRecord 格式后写入 MindRecord 文件 - # tokenize_json 为自定义的对 json 中数据进行分词的方法 - for x in tokenize_json(word_tokenizer, raw_data): - writer.write_raw_data([x]) - writer.commit() - ``` - -详细案例可以参考 [Llama2 中的数据预处理案例](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#%E6%95%B0%E6%8D%AE%E5%8F%8A%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87)。 - -### 在任务中使用 MindRecord 格式数据集 - -通过在 yaml 配置文件中配置数据集相关参数,可以让训练或评测任务使用准备好的 MindRecord 格式数据集。 - -此处,以 Llama2-7B 模型预训练任务来举例说明,在 [pretrain_llama2_7b.yaml 文件](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/pretrain_llama2_7b.yaml#L39) 中的默认配置参数及说明如下: - -```yaml -# dataset -train_dataset: &train_dataset - data_loader: - type: MindDataset - dataset_dir: "" - shuffle: True - input_columns: ["input_ids"] - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - batch_size: 6 - repeat: 1 - numa_enable: False - prefetch_size: 1 - -train_dataset_task: - type: CausalLanguageModelDataset - dataset_config: *train_dataset -``` - -配置如下参数以使用 MindRecord 格式数据集: - -- data_loader.type:dataloader 的类型,此处需要设置为 `MindDataset` 。 -- data_loader.dataset_dir:数据集文件路径。 -- input_columns:设置训练数据集输入的数据列。当前为预训练场景,设置为 `["input_ids"]` 。 - -其余参数介绍可以参考 [配置文件说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html) 的 “模型训练配置” 和 “模型评估配置”。 - -## HuggingFace数据集 - -目前数据集加载功能已接入 [魔乐开源社区](https://modelers.cn/datasets)、[HuggingFace社区](https://huggingface.co/datasets),并支持数据集在线加载与预处理,同时还可对数据集进行[packing](#数据集packing),提升模型训练效率。 - -### 使用说明 - -HuggingFace数据集可实现HuggingFace社区以及魔乐开源社区中的数据集在线、离线加载,下面主要针对环境准备、数据集加载流程、以及在如何在配置文件中配置使用HuggingFace数据集功能进行介绍。 - -#### 对接开源社区 - -- 对接HuggingFace社区 - - 如果需要使用HuggingFace社区中的数据集需要执行如下步骤: - - 1. 环境准备 - - 环境变量 `HF_ENDPOINT` 可以控制开源社区huggingFace实际使用的远程仓库,未配置时默认为 `https://huggingFace.co` , - 针对国内环境,需要配置成镜像地址 ```export HF_ENDPOINT=https://hf-mirror.com``` 。 - - 2. 安装依赖 - - ```shell - pip install datasets - ``` - -- 对接魔乐开源社区 - - 如果需要使用魔乐开源社区中的数据集需要执行如下步骤: - - 1. 环境准备 - - 环境变量 `OPENMIND_HUB_ENDPOINT` 可以控制魔乐开源社区实际使用的远程仓库, - 未配置时默认为 ```export OPENMIND_HUB_ENDPOINT=https://telecom.openmind.cn``` 。 - - 2. 
安装依赖 - - ```shell - git clone https://gitee.com/openmind-ai/openmind-hub.git - cd openmind-hub - pip install -e . - cd .. - git clone https://gitee.com/foundation-models/openmind-datasets.git - cd openmind-datasets - pip install -e . - cd .. - ``` - -> 当环境安装了 openmind-datasets 三方件时,默认对接的是魔乐开源社区,如果这是想对接 HuggingFace,环境变量 `USE_OM` 可以控制具体对接哪个社区,默认值为 `ON` 为魔乐社区,修改为 `OFF` 对接 HuggingFace 社区 - -#### 数据集加载流程 - -![commondataloader.png](image/commondataloader.png) - -在线数据集加载与处理功能主要通过`CommonDataLoader`实现,其中数据加载部分可通过配置文件进行自定义配置,具体配置内容可参考[dataloader参数说明](#dataloader参数说明),在线加载模块需要用户针对不同数据集进行自定义实现,如通过`AlpacaInstructDataHandler`类可实现对`alpaca`数据集进行预处理,具体实现过程可参考[自定义数据handler](#自定义数据handler)。 - -#### dataloader参数说明 - -在线数据集加载功能通过在配置文件中对`data_loader`进行配置来使能,下面是在线数据集加载相关配置的示例: - -```yaml -train_dataset: &train_dataset - input_columns: &input_columns ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] - construct_args_key: *input_columns - data_loader: - type: CommonDataLoader - load_func: 'load_dataset' - shuffle: False - split: "train" - path: "llm-wizard/alpaca-gpt4-data" - packing: pack - handler: - - type: AlpacaInstructDataHandler - tokenizer_name: llama2_7b - seq_length: 4096 - prompt_key: "conversations" - output_columns: ["input_ids", "labels"] - is_dynamic: False - - type: PackingHandler - seq_length: 4096 - output_columns: ["input_ids", "labels", "actual_seq_len"] - adaptor_config: - compress_mask: False - column_names: *input_columns -``` - -其中`data_loader`中相关参数说明如下: - -| 参数名 | 概述 | 类型 | -|----------------|------------------------------------------------------------------------------------------------------------------------------------------------------|:----:| -| type | 固定为`CommonDataLoader`,该模块支持HuggingFace以及魔乐开源社区的数据集加载功能 | str | -| packing | 使用`handler`处理数据集时packing配置项,可选值为`pack`或`truncate` | str | -| load_func | 加载数据集调用接口名,可选值为`load_dataset`或`load_from_disk`,读取通过`save_to_disk`接口保存的数据使用`load_from_disk`,其他场景使用`load_dataset`,默认值为`load_dataset` | str | -| path | 在`load_func=load_dataset`时,该参数含义与[datasets.load_dataset](https://huggingface.co/docs/datasets/loading)中接口相同,在`load_func=load_from_disk`时,该参数为加载数据集路径 | str | -| data_files | 在`load_func=load_dataset`时,该参数含义与[datasets.load_dataset](https://huggingface.co/docs/datasets/loading)中接口相同,在`load_func=load_from_disk`时不生效 | str | -| handler | 可配置多个`handler`,按配置顺序对加载后的数据集进行预处理,`handler`配置说明参考[自定义数据handler](#自定义数据handler)中的handler参数说明 | list | -| adaptor_config | 在模型训练过程中数据集的相关配置,当前支持设置`compress_mask`,在设置`packing`时生效,开启后返回压缩后的数据掩码,默认为`False` | dict | -| shuffle | 是否在读取数据集时开启随机采样 | bool | -| column_names | 设置数据集返回的列名,不指定时返回所有列 | list | -| is_dynamic | 设置数据集返回动态长度的数据,默认为`False` | bool | - -> 除了以上配置外,[datasets.load_dataset](https://huggingface.co/docs/datasets/loading)接口中的所有配置均已支持,且参数含义与功能相同。 - -数据集在配置packing之后返回`actual_seq_len`数据列,其含义可参考[文档](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0027.html)中`actual_seq_qlen`以及`actual_seq_kvlen`参数介绍。 - -### 功能介绍 - -#### 动态序列长度微调 - -`CommonDataLoader`支持加载HuggingFace数据集进行动态shape微调,HuggingFace数据集加载分为在线加载和离线加载,下面以`alpaca`数据集为例介绍如何配置动态shape微调。 - -- 在线加载 - - 在线数据名称为`llm-wizard/alpaca-gpt4-data`,可在[HuggingFace官网](https://huggingface.co/datasets)搜索名称进行下载或使用在线名称进行加载; - - 在线加载配置文件示例: - - ```yaml - train_dataset: &train_dataset - input_columns: &input_columns ["input_ids", "labels"] - dynamic_batch: True # 开启动态shape - divisor: 32 # 配置divisor和remainder后,动态shape中seq_length会成为divisor的倍数以及remainder的和 - remainder: 1 - data_loader: - type: CommonDataLoader - 
shuffle: True - split: "train" # 在线数据集子集名称 - path: "llm-wizard/alpaca-gpt4-data" # 在线数据集名称 - handler: - - type: AlpacaInstructDataHandler - tokenizer_name: llama2_7b - seq_length: 4096 - prompt_key: "conversations" - output_columns: *input_columns - is_dynamic: True - seed: 0 - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - repeat: 1 - numa_enable: False - prefetch_size: 1 - ``` - - 1. `train_dataset`中参数说明可参考[文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html); - - 2. `AlpacaInstructDataHandler`是针对`alpaca`数据集开发的在线处理脚本,如果使用其他数据集,用户需要参考[自定义数据handler](#自定义数据handler)完成自定义数据处理的功能实现。 - -- 离线加载 - - 离线加载需要准备好`alpaca`数据集中的json文件,离线配置与在线配置仅如下配置项不同。 - - ```yaml - train_dataset: - data_loader: - path: "json" # load_dataset接口加载文件格式 - data_files: '/path/alpaca_gpt4_data.json' # alpaca数据集文件路径 - ``` - -配置完数据集加载方式之后,还需要在模型配置中修改`is_dynamic=True`来开启模型动态shape训练。 - -```yaml -model_config: - is_dynamic: True -``` - -由于动态shape会存在算子编译缓存,当运行环境内存有限时,推荐配置如下环境变量来限制编译缓存的数量,避免出现内存不足的问题: - -```shell -export ACLNN_CACHE_LIMIT=10 -export MS_DEV_RUNTIME_CONF="aclnn_cache_queue_length:64" -``` - -- `ACLNN_CACHE_LIMIT`参数说明参考[文档](https://www.hiascend.com/document/detail/zh/canncommercial/800/apiref/envvar/envref_07_0031.html)。 -- `MS_DEV_RUNTIME_CONF`是MindSpore中设置算子缓存序列长度的参数,其中64代表该序列的长度,默认为1024,可根据实际环境进行调整,数值设置过小可能会影响模型训练性能。 - -完成以上所有配置后,即可参考具体使用的模型文档进行动态shape微调。 - -#### 自定义数据handler - -用户可以使用自定义数据 handler 逻辑,对加载到的数据集进行各种数据预处理定制逻辑。 - -- handler参数说明 - - | 参数名 | 概述 | 类型 | - |----------------|-------------------------------------------------------------------------|:--------:| - | type | 自定义数据 handler 名称,自定义handler必须继承`BaseInstructDataHandler` | str | - | tokenizer_name | 使用的 tokenizer 分词器名称 | str | - | tokenizer | tokenizer 相关配置参数, 可以是字典或者字符串,也可以直接配置`tokenizer`对象,优先级低于`tokenizer_name` | dict/str | - | seq_length | 处理序列的最大长度,通常与模型的序列长度相同 | int | - | output_columns | 数据预处理后返回的数据列名 | list | - | prompt_key | 增加 prompt 处理后数据的列名 | str | - -- 开发样例一 - - 自定义数据 handler 一般放在 `mindformers/dataset/handler` 目录下,自定义的需要继承抽象基类 ``BaseInstructDataHandler`` , - 需要实现 ``format_func`` 、 ``tokenize_func`` 两个方法,该方法是对加载到的每条数据进行预处理,可以参考 `alpaca_handler.py` 。 - - ```python - @MindFormerRegister.register(MindFormerModuleType.DATA_HANDLER) - class XXXInstructDataHandler(BaseInstructDataHandler): - - def format_func(self, example): - # 自定义数据格式转换 - - def tokenize_func(self, example): - # 自定义tokenizer分词处理 - ``` - - ``BaseInstructDataHandler`` 默认提供的实现了入口 ``handler`` 方法,用于遍历每条数据进行数据的预处理, - ``format_func`` 用于实现如何从原始数据中转换成所需要的数据格式,而 ``tokenize_func`` 方法用于把处理后的数据进行按自定义分词, - 实例里的入参 ``example`` 为获取到的每一条样本数据。 - -- 开发样例二 - - 若用户想直接对于整个 dataset 进行数据处理,而不是每条数据分批处理的话,可以在自定义 handler 实现入口 ``handle`` 方法,得到的就是完整的 dataset,参考如下: - - ```python - def handle(self, dataset): - """data handler""" - return dataset.rename_columns({"content":"prompt","summary":"answer"}) - ``` - -- alpaca 数据集示例 - - 修改任务配置文件 [finetune_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/finetune_llama2_7b.yaml)。 - - 修改如下参数: - - ```yaml - train_dataset: &train_dataset - input_columns: &input_columns ["input_ids", "labels"] - data_loader: - type: CommonDataLoader - shuffle: True - split: "train" - path: "llm-wizard/alpaca-gpt4-data" - handler: - - type: AlpacaInstructDataHandler - tokenizer_name: llama2_7b - seq_length: 4096 - prompt_key: "conversations" - output_columns: *input_columns - seed: 0 - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - repeat: 1 - numa_enable: False 
- prefetch_size: 1 - ``` - - 其余参数介绍可以参考 [配置文件说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html) 的 “模型训练配置” 和 “模型评估配置”。 - - 自定义数据 handler: - - ```python - @MindFormerRegister.register(MindFormerModuleType.DATA_HANDLER) - class AlpacaInstructDataHandler(BaseInstructDataHandler): - - def format_func(self, example): - """format func""" - source = PROMPT_INPUT.format_map(example) \ - if example.get(self.input_key, "") != "" \ - else PROMPT_NO_INPUT.format_map(example) - target = example.get(self.output_key) - formatted_example = [ - { - "from": self.user_role, - "value": source, - }, - { - "from": self.assistant_role, - "value": target, - }, - ] - - return formatted_example - - def tokenize_func(self, messages): - """tokenize func""" - conversation = self.gen_prompt(messages) - sep = self.template.sep + self.assistant_role + ": " - # Tokenize conversations - rounds = conversation.split(self.template.sep2) - ids = [self.tokenizer.bos_token_id] - mask = [1] - for _, rou in enumerate(rounds): - if rou == "": - break - conv_out = self.tokenizer(rou) - ids.extend(conv_out['input_ids'][1:]) - mask.extend(conv_out['attention_mask'][1:]) - d = {'input_ids': ids, 'attention_mask': mask} - # pylint: disable=W0212 - if not self.dynamic: - d = self.tokenizer._pad(d, max_length=self.seq_length + 1, padding_strategy='max_length') - input_id = d['input_ids'][:self.seq_length + 1] - target = np.array(d['input_ids']) - total_len = int(np.not_equal(target, self.tokenizer.pad_token_id).sum()) - cur_len = 1 - target[:cur_len] = self.ignore_token_id - for _, rou in enumerate(rounds): - if rou == "": - break - parts = rou.split(sep) - if len(parts) != 2: - break - parts[0] += sep - round_len = len(self.tokenizer(rou)['input_ids']) - 1 - instruction_len = len(self.tokenizer(parts[0])['input_ids']) - 3 - - target[cur_len: cur_len + instruction_len] = self.ignore_token_id - - cur_len += round_len - if self.dynamic: - return { - "input_ids": input_id, - "labels": target[:len(input_id)].tolist() - } - target[cur_len:] = self.ignore_token_id - if cur_len < self.seq_length + 1: - if cur_len != total_len: - target[:] = self.ignore_token_id - else: - target = target[:self.seq_length + 1] - label = target.tolist() - return { - "input_ids": input_id, - "labels": label, - } - ``` - -- ADGEN 数据集示例 - - 修改任务配置文件 [run_glm3_6b_finetune_2k_800T_A2_64G.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/glm3/run_glm3_6b_finetune_2k_800T_A2_64G.yaml)。 - - 修改如下参数: - - ```yaml - train_dataset: &train_dataset - data_loader: - type: CommonDataLoader - path: "HasturOfficial/adgen" - split: "train" - shuffle: True - handler: - - type: AdgenInstructDataHandler - phase: "train" - version: 3 - column_names: ["prompt", "answer"] - tokenizer: - type: ChatGLM3Tokenizer - vocab_file: "/path/to/tokenizer.model" - input_columns: ["input_ids", "labels"] - max_source_length: 1024 - max_target_length: 1023 - ignore_pad_token_for_loss: True - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - batch_size: 8 - repeat: 1 - numa_enable: False - prefetch_size: 1 - seed: 0 - ``` - - 其余参数介绍可以参考 [配置文件说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html) 的 “模型训练配置” 和 “模型评估配置”。 - - 自定义 adgen_handler: - - ```python - @MindFormerRegister.register(MindFormerModuleType.DATA_HANDLER) - class AdgenInstructDataHandler(BaseInstructDataHandler): - """agden data handler""" - def handle(self, dataset): - """data handler""" - return dataset.rename_columns({"content": 
"prompt", "summary": "answer"}) - ``` - -#### 数据集packing - -在`CommonDataLoader`中配置`PackingHandler`可以实现对数据进行packing处理,目前需要在前置处理中将原始数据处理为可输入模型的`input_ids`以及`labels`。 - -- 参数说明 - - | 参数名 | 概述 | 类型 | - |----------------|----------------------------------------------------------------------------------------------------------------------------|:----:| - | type | 固定为`PackingHandler`,该模块支持对数据进行packing,在[dataloader](#dataloader参数说明)中配置`packing=pack`和`packing=truncate`时,分别对数据进行非截断和截断的拼接 | str | - | seq_length | packing处理后数据的最大序列长度 | int | - | pad_token | 当packing后样本未达到最大长度时,对`input_ids`填充使用的token id,默认值为0 | int | - | ignore_token | 当packing后样本未达到最大长度时,对`labels`填充使用的token id,默认值为-100 | int | - -- packing示例 - - 按照如下配置,对`alpaca`数据集进行预处理,即可实现在线packing。 - - ```yaml - train_dataset: &train_dataset - input_columns: &input_columns ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] - construct_args_key: *input_columns - data_loader: - type: CommonDataLoader - shuffle: False - split: "train" - path: "llm-wizard/alpaca-gpt4-data" - packing: pack - handler: - - type: AlpacaInstructDataHandler - tokenizer_name: llama2_7b - seq_length: 4096 - prompt_key: "conversations" - output_columns: ["input_ids", "labels"] - - type: PackingHandler - seq_length: 4096 - output_columns: ["input_ids", "labels", "actual_seq_len"] - adaptor_config: - compress_mask: False - seed: 0 - num_parallel_workers: 8 - python_multiprocessing: False - drop_remainder: True - repeat: 1 - numa_enable: False - prefetch_size: 1 - ``` - -使用上述配置文件处理`alpaca`数据集,会执行如下流程: - -1. 使用`AlpacaInstructDataHandler`以及`llama2_7b`的`tokenizer`将原始文本数据处理为`input_ids`和`labels`; -2. 使用`PackingHandler`对处理后的`input_ids`和`labels`进行packing处理,得到拼接到`seq_length`长度的`input_ids`和`labels`, `actual_seq_len`拼接后样本中每个子样本的序列长度,在训练中会根据这个参数生成对应的数据掩码; -3. 
如果在`adaptor_config`中设置`compress_mask=False`表示训练时返回完整的数据掩码,否则返回`actual_seq_len`; - -#### 数据集离线处理 - -`CommonDataLoader`除了支持数据集在线加载与处理,还支持离线处理数据集并进行保存。 - -使用[datasets_preprocess.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/toolkit/data_preprocess/huggingface/datasets_preprocess.py)脚本可以离线处理 HuggingFace 数据集并进行保存。 - -- 参数说明 - - | 参数名 | 概述 | 类型 | - |---------------|-----------------------------------------------------------|:---:| - | config | 离线处理数据的配置文件,与在线处理使用方法相同,具体参考[dataloader](#dataloader参数说明) | str | - | save_path | 数据集经过预处理后的保存路径 | str | - | register_path | 模型API的注册路径,其中包含模型相关Python文件,通常是research目录下模型文件夹的路径 | int | - -- 使用示例 - - 使用[数据集packing](#数据集packing)中提供的packing示例的配置文件即可,执行如下命令。 - - ```shell - python toolkit/data_preprocess/huggingface/datasets_preprocess.py \ - --config data_process.yaml \ - --save_path /path/processed_data - ``` - - 如果需要加载保存后的数据集,需要对yaml进行如下修改: - - ```yaml - train_dataset: &train_dataset - input_columns: &input_columns ["input_ids", "labels", "loss_mask", "position_ids", "attention_mask"] - construct_args_key: *input_columns - data_loader: - type: CommonDataLoader - shuffle: False - load_func: "load_from_disk" - path: "/path/processed_data" - adaptor_config: - compress_mask: False - ``` diff --git a/docs/mindformers/docs/source_zh_cn/function/distributed_parallel.md b/docs/mindformers/docs/source_zh_cn/function/distributed_parallel.md deleted file mode 100644 index 1062a845f8c7b8fe0fc4c25282ab39796ec8c13a..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/function/distributed_parallel.md +++ /dev/null @@ -1,171 +0,0 @@ -# 分布式并行 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/function/distributed_parallel.md) - -## 并行模式与应用场景 - -在大规模深度学习模型的训练中,尤其是面对庞大的数据集和复杂的模型架构时,单一设备的算力往往不足以应对这种需求。为了解决这个问题,MindSpore 提供了一套强大的并行策略配置,通过灵活的并行策略可以大幅提升训练效率,并降低计算资源的消耗。 - -MindSpore 的并行模式包括数据并行、模型并行、流水线并行、序列并行等。这些模式可以单独使用,也可以结合在一起,形成复杂的混合并行策略,以应对不同的模型训练需求。通过合理配置这些并行策略,开发者可以有效利用多设备的计算资源,极大地提升训练效率。 - -在实际应用中,不同的并行策略适用于不同的场景: - -- **数据并行**:适用于数据量大,模型相对简单的场景。 -- **模型并行**:适用于模型参数量巨大,单个设备无法容纳整个模型的场景。 -- **流水线并行**:适用于超大规模模型训练,需多设备共同计算的场景。 -- **序列并行**:适用于长序列输入的模型,减少单设备显存占用的场景。 -- **多副本并行**:通过执行序调度算法控制细粒度多分支的并行,提高计算与通信的相互掩盖。 -- **优化器并行**:将优化器的计算任务分散到多个设备上,以减少内存占用并提高训练效率。 - -> 仓库中提供的 YAML 文件中并行策略配置已经优化,当前推荐用户使用半自动并行,以确保最佳性能和稳定性。 - -## MindSpore Transformers 支持的并行特性 - -MindSpore Transformers 支持多种并行特性,开发者可以利用这些特性来优化不同模型架构和硬件配置的训练。以下表格概述了这些并行特性,并提供了指向 MindSpore 文档中详细说明的链接。 - -| **并行特性** | **描述** | -|-----------------------------------|---------------------------------------------------------------------------------| -| **[数据并行](https://www.mindspore.cn/docs/zh-CN/r2.6.0/features/parallel/data_parallel.html)** | 将数据拆分到多个设备上,并在每个设备上同时进行训练。适用于数据量大且模型相对简单的任务。 | -| **[模型并行](https://www.mindspore.cn/docs/zh-CN/r2.6.0/features/parallel/operator_parallel.html)** | 将模型参数分布到多个设备上,适合单个设备无法容纳整个模型的情况。 | -| **[流水线并行](https://www.mindspore.cn/docs/zh-CN/r2.6.0/features/parallel/pipeline_parallel.html)** | 将模型分割成多个阶段,每个阶段在不同的设备上运行,以实现超大规模模型的高效训练。 | -| **[优化器并行](https://www.mindspore.cn/docs/zh-CN/r2.6.0/features/parallel/optimizer_parallel.html)** | 将优化器计算分布到多个设备上,减少内存占用,提高训练效率。 | -| **序列并行** | 设计用于分摊模型并行无法切分的显存和计算,将Transformer层中的LayerNorm及Dropout的输入按照序列维度进行切分,减少单设备的显存压力。 | -| **[长序列并行](#长序列并行)** | 设计用于处理长序列输入的模型,对所有的input输入和所有的输出activation在sequence维度上进行切分,对于超长序列输入场景进一步减少显存占用。 | -| 
**[多副本并行](https://www.mindspore.cn/docs/zh-CN/r2.6.0/features/parallel/pipeline_parallel.html#mindspore%E4%B8%AD%E7%9A%84interleaved-pipeline%E8%B0%83%E5%BA%A6)** | 用于在多个副本之间实现精细的并行控制,优化性能和资源利用率,适合大规格模型的高效训练。 | - -关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html) 中的并行配置章节下的具体内容。 - -## 并行特性介绍 - -### 长序列并行 - -从生成性AI到科研模型,长序列训练正在变得非常重要。现有的数据、张量和流水线等并行方法无法在序列维度进行切分。当序列维度(S)增长时,训练内存开销会以O($S^2$)的速度增长。序列并行对所有的input输入和所有的输出activation在sequence维度上进行切分,用于减少输入序列长度的限制,有效地支持超长序列训练。 - -#### Ring Attention序列并行 - -长序列并行算法 Ring Attention 是当前业界长序列并行的代表性技术,用于解决长序列训练时的内存开销问题,同时实现计算与通信掩盖。Ring Attention 算法利用 Attention 的分块计算性质,当序列并行度为 N 时,将 Q,K,V 分别切分为 N 个子块,每张卡分别调用 Flash Attention 算子来计算本地 QKV 子块的 Attention 结果。由于每张卡只需要计算切分后 QKV 子块的 Attention,其内存占用大幅降低。Ring Attention 在做 FA 计算的同时采用环形通信向相邻卡收集和发送子块,实现计算与通信的最大化掩盖,保障了长序列并行的整体性能。 - -MindSpore Transformers已支持配置Ring Attention序列并行方案,可通过以下配置项使能: - -```yaml -model: - model_config: - ... - use_ring_attention: True - ... -parallel_config: - ... - context_parallel: 2 - ... -``` - -参数说明: - -- use_ring_attention:是否开启Ring Attention,默认为False。 -- context_parallel:序列并行切分数量,默认为1,根据用户需求配置。 - -关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html) 中的并行配置章节下的具体内容。 - -#### Ulysses序列并行 - -DeepSpeed提出的[Ulysses长序列并行方案](https://arxiv.org/abs/2309.14509),将各个样本在seq维度切分给不同的计算卡;然后,在attention计算之前,对QKV执行all-to-all通信操作,以使每个计算卡接收完整的序列,使得各计算卡可以并行计算不同的注意力头;最后,在attention计算后使用另一个all-to-all来在注意力头上收集结果,同时重新在seq维度上进行切分。该方案可以有效扩展训练的序列长度,同时保持相对较低的通信量。 - -MindSpore Transformers已支持配置Ulysses序列并行方案,可通过以下配置项使能: - -```yaml -model: - model_config: - ... - use_attn_mask_compression: True #使能attention_mask压缩 - ... -parallel: - ... - enable_alltoall: True # 允许插入alltoall算子 - ... -parallel_config: - ... - context_parallel: 2 - context_parallel_algo: ulysses_cp # 使能Ulysses序列并行 - ... -``` - -参数说明: - -- use_attn_mask_compression:是否对Self-Attention中的Score矩阵进行掩码操作,默认为False,Ulysses序列并行方案下建议开启减少显存占用。 -- enable_alltoall:生成alltoall通信算子,默认为False,不启用时将会由allgather等其他算子组合完成等价替代,可参考MindSpore `set_auto_parallel_context`[接口文档](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore/mindspore.set_auto_parallel_context.html);启用Ulysses方案时我们期望能够直接插入alltoall通信算子,因此将该配置项打开。 -- context_parallel_algo:设置为`ulysses_cp`开启Ulysses序列并行。 - -关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html) 中的并行配置章节下的具体内容。 - -#### 混合序列并行 - -目前Ulysses和Ring Attention序列并行方案均存在一定局限性,Ring Attention序列并行方案虽然理论上序列长度能够无限拓展,但通信和计算带宽利用率较低,在序列块大小较低时性能劣于Ulysses序列并行方案。而Ulysses在GQA、MQA场景下的序列并行受Head数量限制,序列长度的扩展有限。混合序列并行融合了Ulysses和Ring Attention序列并行方案,可以解决上述缺陷。 - -MindSpore Transformers已支持配置混合序列并行方案,可通过以下配置项使能: - -```yaml -parallel: - ... - enable_alltoall: True # 允许插入alltoall算子 - ... -parallel_config: - ... - context_parallel: 16 - context_parallel_algo: hybrid_cp # 使能混合序列并行 - ulysses_degree_in_cp: 8 - ... 
-``` - -参数说明: - -- context_parallel_algo:设置为`hybrid_cp`时开启混合序列并行。 -- ulysses_degree_in_cp:Ulysses序列并行切分数量。 - -关于分布式并行参数的配置方法,参见 [MindSpore Transformers 配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html) 中的并行配置章节下的具体内容。 - -### 流水线并行 - -#### 序列流水线并行(Seq-Pipe) - -模型输入按sequence维度进行切分,展开为多个序列块(Sequence Chunk)。在原有的1F1B和1F1B-Interleave上,将调度单位缩小为Sequence Chunk。`seq_split_num`为切分个数,当`seq_split_num`=1时,退化为1F1B或1F1B-Interleave。 - -MindSpore Transformers已支持配置Seq-Pipe流水线并行方案,可通过以下配置项使能: - -```yaml -# parallel context -parallel: - pipeline_config: - pipeline_interleave: true - pipeline_scheduler: 'seqpipe' - -# parallel config -parallel_config: - seq_split_num: 2 -``` - -参数说明: - -- pipeline_scheduler:流水线的调度策略,目前mindformers只支持设置为`"seqpipe"`。 -- seq_split_num:输入按序列维度的切分个数。 - -注意: - -- 目前仅支持Llama和DeepSeek系列模型。 -- 目前暂不支持使用Megatron的多源数据集进行训练的场景。 - -关于分布式并行参数的配置方法,参见 [MindSpore Transformers配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html) 中的并行配置章节下的具体内容。 - -## MindSpore Transformers 分布式并行应用实践 - -在官网提供的[Llama3-70B微调配置](https://gitee.com/kong_de_shu/mindformers/blob/dev/research/llama3/llama3_70b/finetune_llama3_70b.yaml#)文件中,使用了多种分布式并行策略,以提升多机多卡环境中的训练效率。以下是该配置文件中涉及的主要并行策略和关键参数: - -- **数据并行**:未启用额外的数据并行(`data_parallel: 1`)。 -- **模型并行**:模型被切分成8个部分,在不同设备上计算(`model_parallel: 8`)。 -- **流水线并行**:模型分为8个流水线阶段,按顺序在不同设备上运行(`pipeline_stage: 8`)。 -- **序列并行**:开启序列并行(`use_seq_parallel: True`),将Transformer层中的LayerNorm及Dropout的输入按照序列维度进行切分,使各设备只需处理部分的LayerNorm和Dropout,减少模型显存占用。 -- **多副本并行**:通过执行序调度算法控制细粒度多分支的并行(`fine_grain_interleave: 2`),提高计算与通信的相互掩盖。 -- **优化器并行**:优化器计算分散到多个设备上,以减少内存占用(`enable_parallel_optimizer: True`)。 - -> 注意:开启细粒度多副本并行的同时必须开启序列并行。 - -通过以上配置,Llama3-70B的分布式训练在多机多卡环境中可以有效利用硬件资源,实现高效、稳定的模型训练。 diff --git a/docs/mindformers/docs/source_zh_cn/function/fine_grained_activations_swap.md b/docs/mindformers/docs/source_zh_cn/function/fine_grained_activations_swap.md deleted file mode 100644 index bd41e08bbd2e4e65c319f7615d8d8cdd20cbcf05..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/function/fine_grained_activations_swap.md +++ /dev/null @@ -1,272 +0,0 @@ -# 细粒度激活值SWAP - -[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/function/fine_grained_activations_swap.md) - -## 概述 - -在传统大模型训练任务中,计算卡的显存资源常常成为训练瓶颈,采用更大规模的模型并行(model parallel, mp)和流水线并行(pipeline parallel, pp)切分策略,虽然能一定程度上缓解单张计算卡的显存压力,但需要更大规模的集群资源,且引入过多的通信会极大地降低模型的MFU。在集群资源有限的情况下,重计算是另一个缓解内存压力的有效手段,其通过放弃存储正向传播阶段的激活值,并在梯度反向回传时重新计算所需激活值,来降低激活值的显存占用,由于重计算需引入额外的计算开销,因此该方法同样会显著降低模型训练的MFU(Model FLOPs Utilization)。 - -在此背景下,细粒度激活值SWAP技术可以提供第三种降低内存占用的有效手段,且拥有更大的性能优势。具体地,激活值SWAP技术在模型正向传播阶段,将需要长期存储的激活值卸载至host侧,并在反向传播阶段,使用该激活值时,提前将其预取回device侧。资源使用方面,激活值SWAP技术使用D2H/H2D带宽,可以在训练阶段与计算任务、D2D通信任务并发,实现对内存搬运开销的掩盖。 - -细粒度激活值SWAP技术具备较高的使用灵活度。大模型训练的正向传播阶段,将产生数据量大小不同的若干激活值,用户可按需选择特定的激活值进行SWAP,且选择激活值的粒度为算子级。当模型类型或规格改变时,用户可灵活调整对应的SWAP策略,以追求最低的内存开销和最优的性能。 - -## 使用说明 - -### 约束场景 - -- 仅支持静态图O0/O1模式 -- 支持Llama系稠密模型,后续演进支持MoE稀疏模型 -- Somas不支持异构,需在配置文件中设置 - - ```yaml - context: - memory_optimize_level=O0 - ``` - -- 未开启流水线并行时,需使能lazy_inline场景,设置环境变量 - - ```bash - ENABLE_LAZY_INLINE_NO_PIPELINE=1 - ``` - -- 仅支持Ascend后端 - -### 接口说明 - -细粒度激活值SWAP特性通过YAML配置`swap_config`字段使能,包括`swap`、`default_prefetch`、`layer_swap`、`op_swap`四个功能接口,用户可通过此接口灵活选择特定层或特定层的特定算子使能激活值SWAP功能。 - -> 
当前MindSpore框架将内存搬运与内存释放解耦。将激活值从device侧卸载至host侧时,即便数据已全部卸载,其在device侧占用的内存空间并未被立刻释放,而是需要再触发释放操作。内存释放操作触发前,会检测激活值卸载是否完成,若未完成,则进程会原地等待,直至激活值卸载完成。 - -| 配置项 | 类型 | 说明 | -|:--:|:--:|:---| -| swap | Bool | 默认值False。当为False时,本特性的四个功能接口全部不生效;当为True时,激活值SWAP功能开启,并检查`layer_swap`与`op_swap`是否为None,若均为None,则启用默认的SWAP策略,该策略将对所有层中的`flash_attention`算子使能SWAP。若`layer_swap`与`op_swap`存在非None值,则屏蔽默认策略并按照`layer_swap`与`op_swap`的配置使能SWAP功能。 | -| default_prefetch | Int | 默认值1。当swap=True、layer_None、op_swap=None时生效。`default_prefetch`用于调控默认SWAP策略的激活值内存释放时机和预取开始时机。当`default_prefetch`较大时,正向阶段释放内存时机较晚,激活值占用的device内存会在激活值卸载完成后被长期锁住,不被其他数据块复用,同时反向阶段开始将激活值从host侧拷贝至device侧的时机较早,申请相应内存空间的时间较早,内存压力未得到真正缓解;当`default_prefetch`较小时,正向阶段内存释放时机较早,存在等待激活值拷贝任务完成的空等时间,且反向阶段预取的开始时机较晚,若在使用激活值计算时仍未完成激活值预取,则也会引入等待时间,影响端到端性能。因此开放本接口,供用户调试内存释放时机与激活值预期时机,以达到最少的内存占用和最优的端到端性能。| -| layer_swap | List | 默认值None。当为None时,本接口不生效;当为List类型时,本接口包含若干Dict类型的列表元素,每个Dict类型元素包含`backward_prefetch`与`layers`两个键,提供使能SWAP的预取时机(即开始搬回操作的时机)和对应的层索引。 | -| op_swap | List | 默认值None。当为None时,本接口不生效;当为List类型时,本接口包含若干Dict类型的列表元素,每个Dict类型元素包含`op_name`、`backward_prefetch`与`layers`三个键,提供使能SWAP的预取时机和对应的算子名、层索引。 | - -### 混合重计算 - -细粒度激活值SWAP与重计算存在耦合: - -1. 任意算子在同时使能重计算与SWAP时,重计算将生效,SWAP不生效。 -2. 对于任意使能了SWAP的算子,若使用其输出的算子使能了重计算,则该算子的SWAP不生效。 -3. 重计算的YAML配置接口只支持从前至后选择特定数量的层使能重计算,而不支持选择特定层或特定层的特定算子使能重计算,这意味着同时使用SWAP与重计算时,SWAP只能使能靠后的层或靠后层中的算子,无法获取SWAP特性的最大收益。因此当且仅当`swap=True`时,重计算接口功能将按下表调整。 - -| 接口名称 | 原功能 | 开启SWAP后功能 | -|:--:|:---|:---| -| recompute | 确定各pipeline stage中使能重计算的层数 | 不感知pipeline stage,仅接受bool/list类型入参。当为bool类型时,所有层使能重计算;当为list类型时,列表元素为层索引,按索引选择特定层使能重计算 | -| select_recompute | 确定各pipeline stage中特定算子使能重计算的层数 | 不感知pipeline stage,对于每个算子的键值对,仅接受bool/list类型入参。当为bool类型时,所有层使能重计算;当为list类型时,列表元素为层索引,按索引选择特定层使能重计算 | -| select_comm_recompute | 确定各pipeline stage中通信算子使能重计算的层数 | 不感知pipeline stage,仅接受bool/list类型入参。当为bool类型时,所有层使能重计算;当为list类型时,列表元素为层索引,按索引选择特定层使能重计算 | - -## 使用示例 - -本章节以 Llama2-7B 训练为例,演示细粒度激活值SWAP特性的使用。 - -### 环境准备 - -下载 MindSpore Transformers,并准备预训练数据集,如wikitext等。 - -### 示例一:默认SWAP策略 - -在YAML中修改补充重计算与SWAP配置,主要配置参数如下: - -```yaml -context: - memory_optimize_level: "O0" -model: - model_config: - num_layers: 4 -recompute_config: - recompute: False - select_recompute: False - select_comm_recompute: False -swap_config: - swap: True - default_prefetch: 10 -``` - -执行以下脚本启动单机八卡训练,启动脚本所在路径为MindSpore Transformers代码根目录,执行脚本需用户指定YAML文件路径(其中,machine_ip需要填写本地环境IP): - -```bash -export GLOG_v=1 -export MS_MEMORY_STATISTIC=1 -export ENABLE_LAZY_INLINE_NO_PIPELINE=1 -YAML_FILE=$1 # 用户指定YAML文件路径 -ROOT_PATH=`pwd` - -bash ./scripts/msrun_launcher.sh "run_mindformer.py \ - --config ${ROOT_PATH}/${YAML_FILE} \ - --run_mode train \ - --use_parallel True" \ - 8 8 8118 0 output/msrun False 300 -``` - -训练完毕后执行命令`cat output/msrun/worker_0.log | grep 'attention.flash_attention'`查看默认SWAP策略的执行情况: - -```text --INFO - Set op_swap at layer 0: attention.flash_attention, value=10 --INFO - Set op_swap at layer 1: attention.flash_attention, value=10 --INFO - Set op_swap at layer 2: attention.flash_attention, value=10 --INFO - Set op_swap at layer 3: attention.flash_attention, value=10 -``` - -默认SWAP策略执行成功。 - -### 示例二:选择特定层使能SWAP - -在YAML中修改补充重计算与SWAP配置,主要配置参数如下: - -```yaml -context: - memory_optimize_level: "O0" -model: - model_config: - num_layers: 4 -recompute_config: - recompute: False - select_recompute: False - select_comm_recompute: False -swap_config: - swap: True - layer_swap: - - backward_prefetch: 20 - layers: [0,3] -``` - -执行以下脚本启动单机八卡训练,启动脚本所在路径为MindSpore 
Transformers代码根目录,执行脚本需用户指定YAML文件路径(其中,machine_ip需要填写本地环境IP): - -```bash -export GLOG_v=1 -export MS_MEMORY_STATISTIC=1 -export ENABLE_LAZY_INLINE_NO_PIPELINE=1 -YAML_FILE=$1 # 用户指定YAML文件路径 -ROOT_PATH=`pwd` - -bash ./scripts/msrun_launcher.sh "run_mindformer.py \ - --config ${ROOT_PATH}/${YAML_FILE} \ - --run_mode train \ - --use_parallel True" \ - 8 8 8118 0 output/msrun False 300 -``` - -训练完毕后执行命令`cat output/msrun/worker_0.log | grep 'Set layer swap at'`查看默认SWAP策略的执行情况: - -```text --INFO - Set layer swap at layer 0 and value is: 20 --INFO - Set layer swap at layer 3 and value is: 20 -``` - -选择特定层使能SWAP的策略执行成功。 - -### 示例三:选择特定层的特定算子使能SWAP - -在YAML中修改补充重计算与SWAP配置,主要配置参数如下: - -```yaml -context: - memory_optimize_level: "O0" -model: - model_config: - num_layers: 4 -recompute_config: - recompute: False - select_recompute: False - select_comm_recompute: False -swap_config: - swap: True - op_swap: - - op_name: 'attention' - backward_prefetch: 20 - layers: [0,1,2] - - op_name: 'attention' - backward_prefetch: 10 - layers: [3] - - op_name: 'feed_forward' - backward_prefetch: 15 - layers: [1,2] -``` - -执行以下脚本启动单机八卡训练,启动脚本所在路径为MindSpore Transformers代码根目录,执行脚本需用户指定YAML文件路径(其中,machine_ip需要填写本地环境IP): - -```bash -export GLOG_v=1 -export MS_MEMORY_STATISTIC=1 -export ENABLE_LAZY_INLINE_NO_PIPELINE=1 -YAML_FILE=$1 # 用户指定YAML文件路径 -ROOT_PATH=`pwd` - -bash ./scripts/msrun_launcher.sh "run_mindformer.py \ - --config ${ROOT_PATH}/${YAML_FILE} \ - --run_mode train \ - --use_parallel True" \ - 8 8 8118 0 output/msrun False 300 -``` - -训练完毕后执行命令`cat output/msrun/worker_0.log | grep 'Set op_swap at layer'`查看默认SWAP策略的执行情况: - -```text --INFO - Set op_swap at layer 0: .attention, value=20 --INFO - Set op_swap at layer 1: .attention, value=20, .feed_forward, value=15 --INFO - Set op_swap at layer 2: .attention, value=20, .feed_forward, value=15 --INFO - Set op_swap at layer 3: .attention, value=10 -``` - -选择特定层的特定算子使能SWAP成功。 - -### 示例四:细粒度激活值SWAP与重计算混用 - -在YAML中修改补充重计算与SWAP配置,主要配置参数如下: - -```yaml -context: - memory_optimize_level: "O0" -model: - model_config: - num_layers: 4 -recompute_config: - recompute: False - select_recompute: - 'feed_forward': [0,3] - select_comm_recompute: False -swap_config: - swap: True - op_swap: - - op_name: 'attention' - backward_prefetch: 20 - layers: [0,1,2] - - op_name: 'attention' - backward_prefetch: 10 - layers: [3] - - op_name: 'feed_forward' - backward_prefetch: 15 - layers: [1,2] -``` - -执行以下脚本启动单机八卡训练,启动脚本所在路径为MindSpore Transformers代码根目录,执行脚本需用户指定YAML文件路径(其中,machine_ip需要填写本地环境IP): - -```bash -export GLOG_v=1 -export MS_MEMORY_STATISTIC=1 -export ENABLE_LAZY_INLINE_NO_PIPELINE=1 -YAML_FILE=$1 # 用户指定YAML文件路径 -ROOT_PATH=`pwd` - -bash ./scripts/msrun_launcher.sh "run_mindformer.py \ - --config ${ROOT_PATH}/${YAML_FILE} \ - --run_mode train \ - --use_parallel True" \ - 8 8 8118 0 output/msrun False 300 -``` - -训练完毕后执行命令`cat output/msrun/worker_0.log | grep 'Set op_swap at layer' -C 1`查看默认SWAP策略的执行情况: - -```text --INFO - Set select recompute at layer 0: feed_forward --INFO - Set op_swap at layer 0: .attention, value=20 --INFO - Set op_swap at layer 1: .attention, value=20, .feed_forward, value=15 --INFO - Set op_swap at layer 2: .attention, value=20, .feed_forward, value=15 --INFO - Set select recompute at layer 3: feed_forward --INFO - Set op_swap at layer 3: .attention, value=10 -``` - -细粒度激活值SWAP与重计算混用成功。 \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/function/high_availability.md b/docs/mindformers/docs/source_zh_cn/function/high_availability.md 
deleted file mode 100644 index a754a0f4163f8acb5b9c2bb0e4819a05b031a6e9..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/function/high_availability.md +++ /dev/null @@ -1,237 +0,0 @@ -# 高可用特性 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/function/high_availability.md) - -## 概述 - -MindSpore Transformers 高可用特性提供了如下三个功能: - -- **临终 CKPT 功能**:主要针对大模型训练过程中的故障恢复加速,该特性在训练过程中发生故障后,校验中间状态数据的完整性和一致性,生成一次临终 CheckPoint 数据,恢复训练时能够通过该 CheckPoint 数据恢复,减少故障造成的训练迭代损失。 -- **UCE 故障容错恢复功能**:主要是针对大模型训练过程中片上内存的 UCE 故障检测,并完成在线修复,达到 Step 级重计算。 -- **进程级重调度恢复功能**:训练发生异常后,不需要重新拉起整个集群,只需以节点为单位进行重启或替换,完成修复并继续训练。 - -高可用特性目前只支持 MindSpore Ascend 后端的图模式;该特性同时需要支持Step级别恢复,因此配置数据下沉时只支持sink_size 为 1。 - -高可用特性的基础是两张卡存在副本关系,这样当其中一张卡发生故障时,可从另外一张卡恢复,因此权重和优化器都会存在两份冗余,会占用更多的显存。为保证这种冗余关系,必须开启数据并行,保证有两张卡权重一致,同时如果开启了优化器并行,也必须确保存在两张卡的优化器状态一致。 - -三个功能可同时开启,也可以单独开启。组合开启这三个功能时,依次生效的顺序是:UCE故障容错恢复 -> 进程级重调度恢复 -> 临终 CKPT ,如果其中一个功能可以恢复,就不会执行下一个功能。临终 CKPT 功能作为最后的保障,完成该功能后整个训练进程会退出,所以在另外两个功能开启时会默认开启。 - -临终 CKPT 保存 Checkpoint 文件以及通过该文件进行续训均使用现有 MindSpore Transformers 的能力,在使用方式上一致,只是临终 CKPT 依赖于strategy文件,因此在训练和续训时均需要配置该文件夹。 - -当异常触发临终的 CheckPoint 保存时,如果未开启去冗余保存,每个数据并行域只有一张卡保存了 CheckPoint,其余卡不会保存 CheckPoint;所以在恢复训练时,同样需要使能高可用特性才能恢复,否则其他卡无法找到可用的 CheckPoint,会报错退出。用户可通过计算分布式保存的 CheckPoint 数量是否为小于集群数量,来判断该 CheckPoint 是否由临终 CKPT 功能触发。 - -## 使用说明 - -高可用特性开关由环境变量使能,YAML 配置文件中不单独设置开关,但 YAML 文件需要能配置出两张卡的权重和优化器状态一致,详见本文档中的[副本关系配置](#副本关系配置)章节。 - -高可用特性依赖用户安装 MindIO TFT SDK 包,详细请参考[在计算节点安装 MindIO TFT SDK](https://www.hiascend.com/document/detail/zh/mindx-dl/600/clusterscheduling/ref/mindiottp/mindiotft011.html)。 - -### 环境变量配置 - -```shell -export MINDIO_FOR_MINDSPORE=1 -export MS_ENABLE_TFT="{TTP:1,UCE:1,ARF:1}" -export MS_TFT_IP=127.0.0.1 -export MS_TFT_PORT=30051 -``` - -- `MINDIO_FOR_MINDSPORE`:使能 MindIO TFT SDK 支持 MindSpore -- `MS_ENABLE_TFT`:表示启用 TTP、UCE 和 ARF 功能,如果只想启用其中的某一个功能,则将对应的值设置为 1 即可。 - - **TTP (Try To Persist)**:临终 CKPT 功能 - - **UCE (Uncorrectable Memory Error)**:UCE 故障容错恢复功能 - - **ARF (Air Refuelling)**:进程级重调度恢复功能 - - 开启 UCE 或者 ARF 功能时,默认开启 TTP 功能 - -- `MS_TFT_IP` 和 `MS_TFT_PORT` 分别表示 TFT Controller 的 IP 和端口号,无默认值,需要用户指定。如果由 MindSpore Transformers 启动 Controller,则配置用户集群中 rank0 节点的 IP 和端口号。如果用户自行启动 Controller,则配置 Controller 的 IP 和端口号。 - -### YAML 配置 - -YAML配置包含两部分:临终 CKPT 的保存及恢复配置和高可用的副本关系配置。 - -#### 保存及恢复配置 - -临终的 CheckPoint 保存和恢复能力分别用于初始训练和续训,这部分复用现有的 MindSpore Transformers 的配置,以下分别介绍初始训练和续训的配置。 - -- **初始训练配置** - - ```yaml - output_dir: './output' # 保存 CheckPoint 和 Strategy 的目录 - load_checkpoint: '' # 初次训练时配置为空 - src_strategy_path_or_dir: '/output/strategy/' - only_save_strategy: False - resume_training: False # 初次训练时配置为 False - run_mode: 'train' - - callbacks: - - type: CheckpointMonitor - prefix: "llama2_13b" - save_checkpoint_steps: 100 - integrated_save: False - async_save: False - ``` - -- **续训配置** - - ```yaml - output_dir: './output' # 保存 CheckPoint 和 Strategy 的目录 - load_checkpoint: './output/checkpoint/' # 续训时配置 CheckPoint 路径 - src_strategy_path_or_dir: '/output/strategy/' - only_save_strategy: False - resume_training: True # 续训时配置为 True - run_mode: 'train' - - callbacks: - - type: CheckpointMonitor - prefix: "llama2_13b" - save_checkpoint_steps: 100 - integrated_save: False - async_save: False - ``` - -#### 副本关系配置 - -高可用的三个功能的关键是配置出权重和优化器的副本冗余关系,配置的核心是数据并行域的维度大于 2,如果叠加优化器并行,需要同时保证优化器的副本数大于 2。所以配置分两类,开启优化器并行和不开启优化器并行。下面以 8 卡为例,介绍如何配置。 - -- 
**不开启优化器并行** - - 数据并行度 dp 配置为 2 的倍数即可,这样就会存在两张卡的权重和优化器状态一致。 - - ```yaml - parallel: - enable_parallel_optimizer: False - parallel_config: - data_parallel: 2 - model_parallel: 4 - pipeline_stage: 1 - ``` - -- **开启优化器并行** - - 开优化器并行后必须要保证优化器的状态存在副本,配置的关键是 optimizer_weight_shard_size 为 2。此时优化器状态的副本数为 data_parallel/optimizer_weight_shard_size。因此,如果数据并行度配置为 2 时,是不存在优化器副本的,必须把数据并行度配置为 4;此时的副本数为 data_parallel/optimizer_weight_shard_size = 4/2 = 2。 - - ```yaml - parallel: - enable_parallel_optimizer: True - parallel_optimizer_config: - optimizer_weight_shard_size: 2 - parallel_config: - data_parallel: 4 - model_parallel: 2 - pipeline_stage: 1 - ``` - -#### 示例 - -本章节以 Llama2-13B 训练为例演示临终 CKPT 的使用。 - -1. 先安装 MindSpore 和 MindIO -2. 下载 MindSpore Transformers,修改 `configs/llama2/pretrain_llama2_13b_bf16.yaml` 配置文件,主要配置如下: - - ```yaml - # runner config - runner_config: - epochs: 2 - batch_size: 4 - sink_mode: True - sink_size: 1 - - # ...... - - # parallel context config - parallel: - parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel - gradients_mean: False - enable_alltoall: False - full_batch: True - search_mode: "sharding_propagation" - enable_parallel_optimizer: True - strategy_ckpt_save_file: "./ckpt_strategy.ckpt" - parallel_optimizer_config: - gradient_accumulation_shard: False - parallel_optimizer_threshold: 64 - optimizer_weight_shard_size: 4 - - # ...... - - # default parallel of device num = 16 for Atlas 800T A2 - parallel_config: - data_parallel: 8 - model_parallel: 1 - pipeline_stage: 1 - use_seq_parallel: False - micro_batch_num: 1 - vocab_emb_dp: True - gradient_aggregation_group: 4 - ``` - - 需要注意以下关键点: - - - `sink_size: 1`: 临终 CKPT 和 UCE 故障容错恢复等特性不支持 `sink_size` 大于 1 的场景,因此这里配置为 1。 - - `enable_parallel_optimizer: True`: 使能优化器并行。 - - `optimizer_weight_shard_size: 4`: 优化器并行的切分大小为 4。 - - `data_parallel: 8`: 数据并行配置为 8。 - - 按照前面章节的说明,`data_parallel/optimizer_weight_shard_size` 的值为 `8 / 4 = 2`,大于 1,因此存在副本关系。 -3. 执行下面命令启动训练 - - ```bash - export MINDIO_FOR_MINDSPORE=1 - - export MS_ENABLE_TFT="{TTP:1,UCE:1,ARF:1}" - export MS_TFT_IP=127.0.0.1 - export MS_TFT_PORT=30051 - - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_13b_bf16.yaml \ - --train_dataset_dir "/YourDataSetPath" \ - --use_parallel True --run_mode train" 8 - ``` - - 注意:需要将 `/YourDataSetPath` 换成实际数据集的路径。 -4. 待训练执行若干个 step 之后,终止 worker 进程,触发临终 CKPT 保存 - - 注意:通过上述启动方式, MindIO Controller 附着在 worker 0 进程上,此种情况下不能终止 worker 0,否则导致 MindIO Controller 退出, - 无法触发临终 CKPT。但是通过 taskd 方式启动训练时,MindIO Controller 是个单独的进程,可以终止 worker 0 进程。 -5. 确认临终的 CheckPoint 生成 - - 在整个训练进程结束后,通过日志确认最终生成的 CheckPoint 文件的合理性,具体操作如下: - - 1). 执行命令 `find output/checkpoint/ -name '*.ckpt'` 查找生成的 CheckPoint 文件: - - ```text - $ find output/checkpoint/ -name '*.ckpt' - output/checkpoint/rank_2/llama2_13b_rank_2-5_1.ckpt - output/checkpoint/rank_3/llama2_13b_rank_3-5_1.ckpt - output/checkpoint/rank_0/llama2_13b_rank_0-5_1.ckpt - output/checkpoint/rank_5/llama2_13b_rank_5-5_1.ckpt - ``` - - 2). 
执行命令 `cat output/msrun_log/worker_0.log | grep 'Epoch:'` 查看已经训练的 step: - - ```text - $ cat output/msrun_log/worker_0.log | grep 'Epoch:' - 2025-04-07 15:34:27,308 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 1/ 19], loss: 10.649, per_step_time: 103328ms, lr: 0.0, overflow cond: False, loss_scale: 1.0, global_norm: [1 31049], train_throughput_per_npu: 2.896T - 2025-04-07 15:34:29,173 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 2/ 19], loss: 10.633, per_step_time: 1752ms, lr: 1e-05, overflow cond: False, loss_scale: 1.0, global_norm: [1 508834], train_throughput_per_npu: 170.738T - 2025-04-07 15:34:30,941 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 3/ 19], loss: 9.673, per_step_time: 1754ms, lr: 9.981987e-06, overflow cond: False, loss_scale: 1.0, global_norm [10.579812], train_throughput_per_npu: 170.523T - 2025-04-07 15:34:32,704 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 4/ 19], loss: 9.287, per_step_time: 1756ms, lr: 9.928079e-06, overflow cond: False, loss_scale: 1.0, global_norm [21.932272], train_throughput_per_npu: 170.319T - 2025-04-07 15:34:34,469 - [mindformers/core/callback/callback.py:529] - INFO - { Epoch:[ 1/ 2], step:[ 5/ 19], loss: 8.867, per_step_time: 1758ms, lr: 9.8386645e-06, overflow cond: False, loss_scale: 1.0, global_norm [16.986555], train_throughput_per_npu: 170.173T - ``` - - 3). 执行命令 `cat output/msrun_log/worker_0.log | grep 'report group list:'` 查看日志中 MindIO 输出的副本关系: - - ```text - $ cat output/msrun_log/worker_0.log | grep 'report group list:' - 2025-04-07 15:34:27.363613 info 1879138 [TTP controller.cpp:1512] rank:4, report group list: [0, 4] - 2025-04-07 15:34:27.385564 info 1879139 [TTP controller.cpp:1512] rank:7, report group list: [3, 7] - 2025-04-07 15:34:27.393198 info 1879136 [TTP controller.cpp:1512] rank:6, report group list: [2, 6] - 2025-04-07 15:34:27.393515 info 1879142 [TTP controller.cpp:1512] rank:1, report group list: [1, 5] - ``` - - 从上面训练的 step 信息可以看出已经训练的 5 个 step,和 CheckPoint 的文件名 `llama2_13b_rank_2-5_1.ckpt` 中的 5 是一致的。 - - 从日志中输出的副本关系 `[0, 4]`、`[3, 7]`、 `[2, 6]` 和 `[1, 5]` 得知: - - - rank 0 和 rank 4 权重存在副本关系,临终的 Checkpoint 保存在 rank 0 - - rank 3 和 rank 7 权重存在副本关系,临终的 Checkpoint 保存在 rank 3 - - rank 2 和 rank 6 权重存在副本关系,临终的 Checkpoint 保存在 rank 2 - - rank 1 和 rank 5 权重存在副本关系,由于 worker 1 终止,临终的 Checkpoint 保存在 rank 5 diff --git a/docs/mindformers/docs/source_zh_cn/function/image/TrainingStateMonitor_log.png b/docs/mindformers/docs/source_zh_cn/function/image/TrainingStateMonitor_log.png deleted file mode 100644 index f98cbe0cd819576782d60eb731d62c298a692d71..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/function/image/TrainingStateMonitor_log.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/function/image/adam_m_norm.png b/docs/mindformers/docs/source_zh_cn/function/image/adam_m_norm.png deleted file mode 100644 index f8ece7816ed7b404e7f748a002e7d5b4bdfda00f..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/function/image/adam_m_norm.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/function/image/commondataloader.png b/docs/mindformers/docs/source_zh_cn/function/image/commondataloader.png deleted file mode 100644 index ba434972960609f6ddb16c2e30702d00e6717061..0000000000000000000000000000000000000000 Binary files 
a/docs/mindformers/docs/source_zh_cn/function/image/commondataloader.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/function/image/local_loss&local_norm.png b/docs/mindformers/docs/source_zh_cn/function/image/local_loss&local_norm.png deleted file mode 100644 index 3478ae69cf82cfde253adf375be364b743ae7df1..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/function/image/local_loss&local_norm.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/function/image/tensorboard_scalar.png b/docs/mindformers/docs/source_zh_cn/function/image/tensorboard_scalar.png deleted file mode 100644 index 143fc0812e918394dc4e55a5a1e1c14dd4b73dc7..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/function/image/tensorboard_scalar.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/function/image/tensorboard_text.png b/docs/mindformers/docs/source_zh_cn/function/image/tensorboard_text.png deleted file mode 100644 index 6857618c9cca67aac064a24d0122bdca3e7706b9..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/function/image/tensorboard_text.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/function/image/wikitext_sample.png b/docs/mindformers/docs/source_zh_cn/function/image/wikitext_sample.png deleted file mode 100644 index ea2a38a93b3ac3d3ad1d96e1a4c5afae8868291e..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/function/image/wikitext_sample.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/function/monitor.md b/docs/mindformers/docs/source_zh_cn/function/monitor.md deleted file mode 100644 index 32fb01411eb9f538a38967e32714563db6b07d5f..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/function/monitor.md +++ /dev/null @@ -1,223 +0,0 @@ -# 训练指标监控 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/function/monitor.md) - -MindSpore Transformers 支持 TensorBoard 作为可视化工具,用于监控和分析训练过程中的各种指标和信息。TensorBoard 是一个独立的可视化库,需要用户手动安装,它提供了一种交互式的方式来查看训练中的损失、精度、学习率、梯度分布等多种内容。用户在训练`yaml`文件中配置 TensorBoard 后,在大模型训练过程中会实时生成并更新事件文件,可以通过命令查看训练数据。 - -## 配置说明 - -在训练`yaml`文件中配置"monitor_config"、"tensorboard"和"callbacks"关键字,训练中会在配置的保存地址下保存tensorboard事件文件。 -配置示例如下: - -### `yaml`文件配置样例 - -```yaml -seed: 0 -output_dir: './output' - -monitor_config: - monitor_on: True - dump_path: './dump' - target: ['layers.0', 'layers.1'] # 只监控第一、二层的参数 - invert: False - step_interval: 1 - local_loss_format: ['log', 'tensorboard'] - local_norm_format: ['log', 'tensorboard'] - device_local_norm_format: ['log', 'tensorboard'] - optimizer_state_format: null - weight_state_format: null - throughput_baseline: null - print_struct: False - -tensorboard: - tensorboard_dir: 'worker/tensorboard' - tensorboard_queue_size: 10 - log_loss_scale_to_tensorboard: True - log_timers_to_tensorboard: True - -callbacks: - - type: MFLossMonitor - per_print_times: 1 -``` - -| monitor_config字段参数名称 | 说明 | 类型 | -|-----------------------------------------|------------------------------------------------------------------------------------------|---------------| -| monitor_config.monitor_on | 设置是否开启监控。默认为`False`,此时以下所有参数不生效 | bool | -| monitor_config.dump_path | 
设置训练过程中`local_norm`、`device_local_norm`、`local_loss`指标文件的保存路径。未设置或设置为`null`时取默认值'./dump' | str | -| monitor_config.target | 设置指标`优化器状态`和`local_norm`所监控的的目标参数的名称(片段),可为正则表达式。未设置或设置为`null`时取默认值['.*'],即指定所有参数 | list[str] | -| monitor_config.invert | 设置反选`monitor_config.target`所指定的参数。默认为`False` | bool | -| monitor_config.step_interval | 设置记录指标的频率。默认为1,即每个step记录一次 | int | -| monitor_config.local_loss_format | 设置指标`local_loss`的记录形式 | str或list[str] | -| monitor_config.local_norm_format | 设置指标`local_norm`的记录形式 | str或list[str] | -| monitor_config.device_local_norm_format | 设置指标`device_local_norm`的记录形式 | str或list[str] | -| monitor_config.optimizer_state_format | 设置指标`优化器状态`的记录形式 | str或list[str] | -| monitor_config.weight_state_format | 设置指标`权重L2-norm`的记录形式 | str或list[str] | -| monitor_config.throughput_baseline | 设置指标`吞吐量线性度`的基线值,需要为正数。会同时写入到 Tensorboard 和日志。未设置时默认为`null`,表示不监控该指标 | int或float | -| monitor_config.print_struct | 设置是否打印模型的全部可训练参数名。若为`True`,则会在第一个step开始时打印所有可训练参数的名称,并在step结束后退出训练。默认为`False` | bool | - -上述 xxx_format 形式的参数的可选值为字符串'tensorboard'和'log'(分别表示写入 Tensorboard 和写入日志),或由两者组成的列表,或`null`。未设置时均默认为`null`,表示不监控对应指标。 - -**注意**,当前开启对`优化器状态`和`权重L2 norm`指标的监控时会极大增加训练进程的耗时,请根据需要谨慎选择;`monitor_config.dump_path`路径下对应的"rank_x"目录将被清空,请确保所设置路径下没有需要保留的文件。 - -| tensoraboard字段参数名称 | 说明 | 类型 | -|-------------------------------------------|-------------------------------------------------------|------| -| tensorboard.tensorboard_dir | 设置 TensorBoard 事件文件的保存路径 | str | -| tensorboard.tensorboard_queue_size | 设置采集队列的最大缓存值,超过该值便会写入事件文件,默认值为10 | int | -| tensorboard.log_loss_scale_to_tensorboard | 设置是否将 loss scale 信息记录到事件文件,默认为`False` | bool | -| tensorboard.log_timers_to_tensorboard | 设置是否将计时器信息记录到事件文件,计时器信息包含当前训练步骤(或迭代)的时长以及吞吐量,默认为`False` | bool | - -`tensorboard.tensorboard_dir`可通过环境变量'MA_SUMMARY_LOG_DIR'来指定,此时若`tensorboard`未配置,则会自动生成一个默认的`tensorboard`配置。 -需要注意的是,在没有`tensorboard`配置时,`monitor_config`在xxx_format中设置的"tensorboard"将被替换为"log",即从写入tensorboard事件文件改为在日志中进行相应信息的打印。 - -## 查看训练数据 - -进行上述配置后,训练期间将会在路径 `./worker/tensorboard/rank_{id}` 下保存每张卡的事件文件,其中 `{id}` 为每张卡对应的的rank号。事件文件以 `events.*` 命名。文件中包含 `scalars` 和 `text` 数据,其中 `scalars` 为训练过程中关键指标的标量,如学习率、损失等; `text` 为训练任务所有配置的文本数据,如并行配置、数据集配置等。此外,根据具体配置,部分指标将在日志中进行展示。 - -使用以下命令可以启动 Tensorboard Web 可视化服务: - -```bash -tensorboard --logdir=./worker/tensorboard/ --host=0.0.0.0 --port=6006 -``` - -|参数名称 | 说明 | -|--------|--------------------------------------------------------| -| logdir | TensorBoard保存事件文件的文件夹路径 | -| host | 默认是 127.0.0.1,表示只允许本机访问;设置为 0.0.0.0 可以允许外部设备访问,请注意信息安全 | -| port | 设置服务监听的端口,默认是 6006 | - -输入样例中的命令后会显示: - -```shell -TensorBoard 2.18.0 at http://0.0.0.0:6006/ (Press CTRL+C to quit) -``` - -其中 `2.18.0` 表示 TensorBoard 当前安装的版本号(推荐版本为 `2.18.0` ), `0.0.0.0` 和 `6006` 分别对应输入的 `--host` 和 `--port` ,之后可以在本地PC的浏览器中访问 `服务器公共ip:端口号` 查看可视化页面,例如服务器的公共IP为 `192.168.1.1` ,则访问 `192.168.1.1:6006` 。 - -### 指标可视化说明 - -回调函数`MFLossMonitor`和`TrainingStateMonitor`将分别对不同的标量指标进行监控。其中`TrainingStateMonitor`不需要用户在配置文件中设置,会根据monitor_config自动进行添加。 - -#### MFLossMonitor监控指标 - -`MFLossMonitor`监控的指标名称和说明如下: - -| 标量名 | 说明 | -|---------------|-----------------------------------------------------| -| learning-rate | 学习率 | -| batch-size | 批次大小 | -| loss | 损失 | -| loss-scale | 损失缩放因子,记录需要设置`log_loss_scale_to_tensorboard`为`True` | -| grad-norm | 梯度范数 | -| iteration-time | 训练迭代所需的时间,记录需要设置`log_timers_to_tensorboard`为`True` | -| throughput | 数据吞吐量,记录需要设置`log_timers_to_tensorboard`为`True` | -| model-flops-throughput-per-npu | 模型算力吞吐量,单位为TFLOPS/npu(万亿次浮点数运算每秒每卡) | -| 
B-samples-per-day | 集群数据吞吐量,单位为B samples/day(十亿样本每天),记录需要设置`log_timers_to_tensorboard`为`True` | - -在 Tensorboard 的 SCALARS 页面中,上述指标(假设名为 `scalar_name`)除了最后两个,其他都存在 `scalar_name` 和 `scalar_name-vs-samples` 两个下拉标签页。其中 `scalar_name` 下展示了该标量随训练迭代步数进行变化的折线图; `scalar_name-vs-samples` 下展示了该标量随样本数进行变化的折线图。如下图所示为学习率`learning-rate`的曲线图示例: - -![/tensorboard_scalar](./image/tensorboard_scalar.png) - -#### TrainingStateMonitor监控指标 - -`TrainingStateMonitor`监控的指标名称和说明如下: - -| 标量名 | 说明 | -|----------------------|-----------------------------------------------| -| local_norm | 单卡上各参数的梯度范数,记录需要设置`local_norm_format`非null | -| device_local_norm | 单卡上的总梯度范数,记录需要设置`device_local_norm_format`非null | -| local_loss | 单卡上的局部损失,记录需要设置`local_loss_format`非null | -| adam_m_norm | 优化器一阶矩估计各参数的范数,记录需要设置`optimizer_state_format`非null | -| adam_v_norm | 优化器二阶矩估计各参数的范数,记录需要设置`optimizer_state_format`非null | -| weight_norm | 权重L2范数,记录需要设置`weight_state_format`非null | -| throughput_linearity | 数据吞吐线性度,记录需要设置`throughput_baseline`非null | - -根据具体的设置,上述指标将在 Tensorboard 或日志中进行展示,如下: - -**日志效果示例** - -![/TrainingStateMonitor_log](./image/TrainingStateMonitor_log.png) - -**tensorboard可视化效果示例** - -adam_m_norm - -![/adam_m_norm](./image/adam_m_norm.png) - -local_loss与local_norm - -![/local_loss&local_norm](./image/local_loss&local_norm.png) - -### 文本数据可视化说明 - -在 TEXT 页面中,每个训练配置存在一个标签页,其中记录了该配置的值。如下图所示: - -![/tensorboard_text](./image/tensorboard_text.png) - -所有配置名和说明如下: - -| 配置名 | 说明 | -|----------------------------|--------------------------------------------------------------| -| seed | 随机种子 | -| output_dir | 保存checkpoint、strategy的路径 | -| run_mode | 运行模式 | -| use_parallel | 是否开启并行 | -| resume_training | 是否开启断点续训功能 | -| ignore_data_skip | 是否忽略断点续训时跳过数据的机制,而从头开始读取数据集。只在 `resume_training` 值为`True`时记录 | -| data_skip_steps | 数据集跳过步数。只在 `ignore_data_skip` 被记录且值为`False`时记录 | -| load_checkpoint | 加载权重的模型名或权重路径 | -| load_ckpt_format | 加载权重的文件格式。只在 `load_checkpoint` 值不为空时记录 | -| auto_trans_ckpt | 是否开启自动在线权重切分或转换。只在 `load_checkpoint` 值不为空时记录 | -| transform_process_num | 转换checkpoint的进程数。只在 `auto_trans_ckpt` 被记录且值为`True`时记录 | -| src_strategy_path_or_dir | 源权重分布式策略文件路径。只在 `auto_trans_ckpt` 被记录且值为`True`时记录 | -| load_ckpt_async | 是否异步加载权重。只在 `load_checkpoint` 值不为空时记录 | -| only_save_strategy | 任务是否仅保存分布式策略文件 | -| profile | 是否开启性能分析工具 | -| profile_communication | 是否在多设备训练中收集通信性能数据。只在 `profile` 值为`True`时记录 | -| profile_level | 采集性能数据级别。只在 `profile` 值为`True`时记录 | -| profile_memory | 是否收集Tensor内存数据。只在 `profile` 值为`True`时记录 | -| profile_start_step | 性能分析开始的step。只在 `profile` 值为`True`时记录 | -| profile_stop_step | 性能分析结束的step。只在 `profile` 值为`True`时记录 | -| profile_rank_ids | 指定rank ids开启profiling。只在 `profile` 值为`True`时记录 | -| profile_pipeline | 是否按流水线并行每个stage的其中一张卡开启profiling。只在 `profile` 值为`True`时记录 | -| init_start_profile | 是否在Profiler初始化的时候开启数据采集。只在 `profile` 值为`True`时记录 | -| layer_decay | 层衰减系数 | -| layer_scale | 是否启用层缩放 | -| lr_scale | 是否开启学习率缩放 | -| lr_scale_factor | 学习率缩放系数。只在 `lr_scale` 值为`True`时记录 | -| micro_batch_interleave_num | batch_size的拆分份数,多副本并行开关 | -| remote_save_url | 使用AICC训练作业时,目标桶的回传文件夹路径 | -| callbacks | 回调函数配置 | -| context | 环境配置 | -| data_size | 数据集长度 | -| device_num | 设备数量(卡数) | -| do_eval | 是否开启边训练边评估 | -| eval_callbacks | 评估回调函数配置。只在 `do_eval` 值为`True`时记录 | -| eval_step_interval | 评估step间隔。只在 `do_eval` 值为`True`时记录 | -| eval_epoch_interval | 评估epoch间隔。只在 `do_eval` 值为`True`时记录 | -| eval_dataset | 评估数据集配置。只在 `do_eval` 值为`True`时记录 | -| eval_dataset_task | 评估任务配置。只在 `do_eval` 值为`True`时记录 | -| lr_schedule | 学习率 | -| metric | 评估函数 | -| model 
| 模型配置 | -| moe_config | 混合专家配置 | -| optimizer | 优化器 | -| parallel_config | 并行策略配置 | -| parallel | 自动并行配置 | -| recompute_config | 重计算配置 | -| remove_redundancy | checkpoint保存时是否去除冗余 | -| runner_config | 运行配置 | -| runner_wrapper | wrapper配置 | -| monitor_config | 训练指标监控配置 | -| tensorboard | TensorBoard配置 | -| train_dataset_task | 训练任务配置 | -| train_dataset | 训练数据集配置 | -| trainer | 训练流程配置 | -| swap_config | 细粒度激活值SWAP配置 | - -> 上述训练配置来源于: -> -> 1. 用户在训练启动命令 `run_mindformer.py` 中传入的配置参数; -> 2. 用户在训练配置文件 `yaml` 中设置的配置参数; -> 3. 训练默认的配置参数。 -> -> 可配置的所有参数请参考[配置文件说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html)。 \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/function/resume_training.md b/docs/mindformers/docs/source_zh_cn/function/resume_training.md deleted file mode 100644 index ffc84a79d26ddb6da0fb7994ef13207002c4dafe..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/function/resume_training.md +++ /dev/null @@ -1,278 +0,0 @@ -# 权重保存与断点续训 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/function/resume_training.md) - -## 权重保存 - -### 概述 - -在深度学习模型的训练过程中,保存模型的权重是至关重要的一步。权重保存功能使得我们能够在训练的任意阶段存储模型的参数,以便用户在训练中断或完成后进行恢复、继续训练、评估或部署。通过保存权重。同时还可以在不同环境下复现实验结果。 - -### 目录结构 - -在训练过程中,MindSpore Transformers会在输出目录中生成两个权重保存文件夹:`checkpoint` 和 `checkpoint_network`。 - -| 文件夹 | 描述 | -|--------------------|-----------------------------------------------------| -| checkpoint | 保存权重、优化器状态、step和epoch于ckpt文件中,用于**断点恢复训练**。 | -| checkpoint_network | 仅保存权重参数于ckpt文件中,适用于作为**预训练权重**的加载或**推理评估**,不支持断点续训。 | - -#### `checkpoint`目录结构 - -`checkpoint`文件夹中的权重文件按如下格式保存: - -```text -checkpoint - ├── rank_0 - ├── meta.json - └── {prefix}-{epoch}_{step}.ckpt - ... - └── rank_x - ├── meta.json - └── {prefix}-{epoch}_{step}.ckpt -``` - -| 文件 | 描述 | -|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| meta.json | 记录最后保存的权重的`epoch`、`step`和权重名,每个rank进程独立维护一个`meta.json`文件。 | -| {prefix}-{epoch}_{step}.ckpt | 保存的权重文件,`prefix`包含rank_id信息,格式为`{prefix}-{epoch}_{step}.ckpt`。如果前缀相同的文件已经存在,系统会自动递增后缀。开启数据下沉时,`epoch`位置计算方式为 $\frac{CurrentTotalStepNumber}{SinkSize} = \frac{((CurrentEpoch-1)*StepsPerEpoch+CurrentStepInEpoch)}{SinkSize}$,`step`固定为`sink_size` | - -#### `checkpoint_network`目录结构 - -```text -checkpoint - ├── rank_0 - └── {prefix}-{epoch}_{step}.ckpt - ... - └── rank_x - └── {prefix}-{epoch}_{step}.ckpt -``` - -| 文件 | 描述 | -|------------------------------|-------------------------------------------------------------------------------------------------------| -| {prefix}-{epoch}_{step}.ckpt | 保存的权重文件,`prefix`包含rank_id信息,格式为`{prefix}-{epoch}_{step}.ckpt`。如果前缀相同的文件已经存在,系统会自动递增后缀。开启数据下沉时的命名规则同上。 | - -### 配置与使用 - -#### YAML参数配置 - -用户可通过修改配置文件来控制权重保存的行为。以下是主要参数: - -| 参数 | 描述 | -|-----------------------|-----------------------------------| -| save_checkpoint_steps | 每多少步保存一次权重,不设置时为不保存。 | -| keep_checkpoint_max | 最多同时保存多少个权重文件,达到上限后会在保存权重时删除最旧的权重文件。 | - -用户可修改`yaml`配置文件中`CheckpointMonitor`下的字段来控制权重保存行为。例如: - -```yaml -callbacks: - ... - - type: CheckpointMonitor - prefix: "llama2_7b" - save_checkpoint_steps: 500 - keep_checkpoint_max: 3 - ... 
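  # 按上文 {prefix}-{epoch}_{step}.ckpt 的命名规则(prefix 中还会追加 rank 信息),
  # 未开启数据下沉时,第 1 个 epoch 的第 500 步保存的文件名形如 llama2_7b_rank_0-1_500.ckpt(示例文件名,仅供参考)
  # 权重个数达到 keep_checkpoint_max(此处为 3)后,再次保存时会先删除最旧的权重文件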
-``` - -上例中表示每隔500步保存一次权重,最多同时存储三个权重。 - -## 断点续训 - -### 概述 - -MindSpore Transformers支持**step级断点续训**功能,允许在训练中保存模型的checkpoint,并在训练中断后,加载保存的checkpoint恢复之前的状态继续训练。这一特性在处理大规模训练任务时尤为重要,能够有效减少因意外中断导致的时间和资源浪费。此外,在数据集不变,但`global batch size`改变的断点续训场景下,例如更换集群或修改配置时,本工具还支持续训步数与数据跳过步数自动同比例缩放。 - -### 配置与使用 - -#### YAML参数配置 - -用户可通过修改配置文件来控制断点续训的行为。以下是主要参数,其他参数可参考CheckpointMonitor介绍: - -| 参数 | 描述 | -| --------------- |--------------------------------------------------------------------------------------------------------------| -| load_checkpoint | 断点续训时加载的权重路径。路径可以是文件夹路径(用于加载分布式权重),也可以是具体权重文件的路径。默认为空字符串,即不加载权重(断点续训时必填) | -| resume_training | 断点续训开关,可设置为`True`或指定特定的权重文件名。为`True`时,系统会自动从上次中断处恢复训练。默认为`False` | -| load_ckpt_async | 是否将加载权重与模型编译的操作并行执行,不支持在线自动切分权重场景(auto_trans_ckpt=True),该场景下不生效。默认为False串行执行。
    为`True`时,并行执行,减少总体拉起续训的耗时 | - -根据传入参数不同,可分为如下四种情况: - -| load_checkpoint | resume_training | 功能描述 | 是否为推荐使用方式 | -|-----------------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------| -| 权重文件路径 | True | 基于load_checkpoint指代的权重续训 | √ | -| 权重文件路径 | 权重文件名 | resume_training指代的文件名无效,基于load_checkpoint指代的权重续训 | × | -| 权重文件夹路径 | True | **场景1:"单机"或"多机+共享目录"或"ModelArts"**
    ① 基于meta.json记录的权重续训,支持故障恢复。
    ② 若任一rank文件夹下缺少meta.json,所有rank基于最后时间戳的权重续训。
    **场景2:"多机+非共享目录"**
    所有rank基于最后时间戳的权重续训。 | √ | -| 权重文件夹路径 | 权重文件名 | 基于resume_training指代的权重续训 | √ | - -此外,用户还可通过增改配置文件的如下参数来使用相关功能。 - -| 参数 | 描述 | -|------------------|-------------------------------------------------------------------------------------------------------------| -| ignore_data_skip | 是否忽略断点续训时跳过数据的机制,而从头开始读取数据集。用于续训时数据集更换的场景。设置为`True`时不会跳过数据集,默认为`False`。 | -| data_skip_steps | 数据集跳过步数。用于更换数据集续训后再次断开续训或`global batch size`改变的场景,须手动设置此参数来配置新数据集跳过步数,如`global batch size`改变,需向下整除缩放系数后再传入。 | - -#### 故障恢复机制 - -当`resume_training`设置为`True`时,系统会自动基于`meta.json`记录的权重进行续训。如果某个rank的权重文件缺失或损坏,系统会回退到上一个可用的权重进行恢复。 - -> 分布式环境中,断点续训要求所有节点的权重在同一共享目录下。用户可通过环境变量`SHARED_PATHS`来设置共享路径。 - -### 分布式训练示例 - -以下示例演示了如何在单卡和多卡环境中启动断点续训。示例基于`llama2_7b` -模型,相关配置文件[configs/llama2/pretrain_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/pretrain_llama2_7b.yaml)。 - -#### 完整训练 - -1. 修改`configs/llama2/pretrain_llama2_7b.yaml`: - - 根据需要设置并行配置: - - ```yaml - parallel_config: - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 2 - micro_batch_num: 2 - ``` - - 根据需要设置模型权重保存配置: - - ```yaml - callbacks: - ... - - type: CheckpointMonitor - prefix: "llama2_7b" - save_checkpoint_steps: 10 - keep_checkpoint_max: 3 - integrated_save: False - async_save: False - ... - ``` - -2. 准备数据集,此处以[wikitext2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#%E6%95%B0%E6%8D%AE%E5%8F%8A%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87)为例,启动4卡分布式训练: - - ```shell - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_7b.yaml \ - --train_dataset /path/to/wikitext2-llama2.mindrecord \ - --run_mode train \ - --use_parallel True" 4 - ``` - - 在第四次保存完毕后,结束进程,此时`checkpoint`下的`rank_0`文件夹结构为: - - ```text - checkpoint/rank_0 - ├── llama2_7b_rank_0-10_2.ckpt - ├── llama2_7b_rank_0-15_2.ckpt - ├── llama2_7b_rank_0-20_2.ckpt - └── meta.json - ``` - -#### 断点续训 - -1. 修改配置,指定断点续训权重文件: - - ```yaml - load_checkpoint: './output/checkpoint' - resume_training: True - ``` - -2. 
启动断点续训: - - ```shell - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_7b.yaml \ - --train_dataset /path/to/wikitext2-llama2.mindrecord \ - --run_mode train \ - --use_parallel True" 4 - ``` - - 如若初始步数从第`42`步开始,则断点续训成功。由于最后保存的权重包含了第`40`步的信息,`sink_size`默认为`2`,即每两步打印一次信息,因此初始步数为`42`。 - -#### 切换数据集断点续训 - -在切换数据集并进行断点续训时,有三种主要场景,每个场景需要针对配置文件进行不同的修改。下面逐一介绍每种情况,并详细说明在哪些场景下需要对基本断点续训流程的哪一步进行修改,以及如何修改具体配置来达成预期效果。 - -**场景一:全新数据集,继续训练(无需跳过已训练的步数)** - -在这种场景中,当切换到一个全新数据集时,模型的训练将从新数据集的开头开始,而无需跳过任何步数。对于这种情况,配置文件需要设置为**忽略之前的数据进度**,让模型在新数据集上从头训练。 - -- **配置修改**:需要在基本断点续训流程的第一步的基础上对`ignore_data_skip`进行设置。将`ignore_data_skip`设置为`True`,表示不跳过数据集。 - - ```yaml - load_checkpoint: './output/checkpoint' - resume_training: True - ignore_data_skip: True - ``` - -- **预期效果**:模型将在新数据集上从头训练,而不会跳过任何步数。 - -**场景二:在新数据集上断点续训,并跳过部分已训练的步数** - -在这种情况下,模型在新数据集上已经训练了一部分(例如断开前已训练了`2`步),期望从上次中断的地方继续训练。此时,必须手动指定需要跳过的步数。 - -- **配置修改**:需要在基本断点续训流程的第一步的基础上对`ignore_data_skip`和`data_skip_steps`进行设置。将`ignore_data_skip`设置为`False`,并且通过`data_skip_steps`指定要跳过的已训练步数(例如,跳过`2`步)。 - - ```yaml - load_checkpoint: './output/checkpoint' - resume_training: True - ignore_data_skip: False - data_skip_steps: 2 - ``` - -- **预期效果**:模型将跳过新数据集的前`2`步,从第`3`步开始继续训练。 - -**场景三:在新数据集上断点续训时,`global batch size`发生变化** - -如果在新数据集上继续断点续训时,`global batch size`改变了(例如,变为原先的 2 倍),手动指定需跳过的步数时需要对已训练的步数进行缩放。具体来说,跳过的步数需要根据缩放系数向下整除。例如,如果`global batch size`变为原先的`2`倍,需跳过的步数则相应减少一半。 - -- **配置修改**:需要在场景二的基础上对`data_skip_steps`进行调整。将`data_skip_steps`设置为缩放后的步数。例如,`global batch size`变为原先的`2`倍,需跳过的步数变为`1`(向下整除)。 - - ```yaml - load_checkpoint: './output/checkpoint' - resume_training: True - ignore_data_skip: False - data_skip_steps: 1 - ``` - -- **预期效果**:模型将根据新的`global batch size`调整跳过的步数,并从正确的地方继续训练。 - -#### 故障恢复示例 - -当部分权重文件缺失时,系统会自动基于上一个可用的权重进行恢复。 - -1. 删除`rank_3`下的`llama2_7b_rank_0-20_2.ckpt`文件。删除后文件夹结构应为: - - ```text - checkpoint/rank_3 - ├── llama2_7b_rank_0-10_2.ckpt - ├── llama2_7b_rank_0-15_2.ckpt - └── meta.json - ``` - -2. 修改配置,启用故障恢复: - - ```yaml - load_checkpoint: './output/checkpoint' - resume_training: True - ``` - -3. 启动分布式训练: - - ```shell - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_7b.yaml \ - --train_dataset /path/to/wikitext2-llama2.mindrecord \ - --run_mode train \ - --use_parallel True" 4 - ``` - - 如若初始步数从第`32`步开始,则断点续训成功。由于`rank_3`下的包含了第`40`步的信息的权重被删除,因此自动使用上一次保存的权重,即包含第 - `30`步信息的权重。由于`sink_size`默认为`2`,即每两步打印一次信息,因此初始步数为`32`。 - -### 注意事项 - -- **数据下沉模式**:分布式断点续训必须开启数据下沉模式,配置`sink_mode=True`。 -- **权重文件检查**:确保断点续训加载的权重为训练中断时的权重,而不是整个训练过程最后保存的权重,否则会报错。 diff --git a/docs/mindformers/docs/source_zh_cn/function/safetensors.md b/docs/mindformers/docs/source_zh_cn/function/safetensors.md deleted file mode 100644 index d234aa37f0a44be62eb93e083ba1fef0c3770071..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/function/safetensors.md +++ /dev/null @@ -1,243 +0,0 @@ -# Safetensors权重 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/function/safetensors.md) - -## 概述 - -Safetensors 是 Huggingface 推出的一种可靠、易移植的机器学习模型存储格式,用于安全地存储Tensor,而且存储速度较快(零拷贝)。本文主要介绍MindSpore Transformers如何支持该文件格式的保存与加载,帮助用户更好更快地使用权重。 - -## Safetensors权重示例 - -Safetensors文件主要分为两种类型:完整权重文件和分布式权重文件。以下是它们的获取方式及对应的文件示例。 - -### 完整权重 - -Safetensors完整权重可通过以下两种方式获取: - -1. 直接从Huggingface上下载。 -2. 
通过MindSpore Transformers分布式训练后,通过[合并脚本](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/transform_weight.html#safetensors%E6%9D%83%E9%87%8D%E7%A6%BB%E7%BA%BF%E5%90%88%E5%B9%B6)生成完整权重。 - -Huggingface Safetensors示例目录结构: - -```text -qwen2_7b - └── hf_unified_safetenosrs - ├── model-00001-of-00004.safetensors - ├── model-00002-of-00004.safetensors - ├── model-00003-of-00004.safetensors - ├── model-00004-of-00004.safetensors - └── model.safetensors.index.json # Huggingface权重参数和文件的存储关系映射json文件 -``` - -MindSpore Safetensors示例目录结构: - -```text -qwen2_7b - └── ms_unified_safetenosrs - ├── model-00001-of-00004.safetensors - ├── model-00002-of-00004.safetensors - ├── model-00003-of-00004.safetensors - ├── model-00004-of-00004.safetensors - ├── hyper_param.safetensors # 训练任务记录的超参文件 - └── param_name_map.json # MindSpore权重参数和文件的存储关系映射json文件 -``` - -### 分布式权重 - -Safetensors分布式权重可通过以下两种方式获取: - -1. 通过MindSpore Transformers分布式训练生成。 -2. 通过[格式转换脚本](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore/mindspore.ckpt_to_safetensors.html),将原有分布式ckpt权重转换为Safetensors格式。 - -分布式Safetensors示例目录结构: - -```text -qwen2_7b - └── distributed_safetenosrs - ├── rank_0 - └── qwen2_7b_rank_0.safetensors - ├── rank_1 - └── qwen2_7b_rank_1.safetensors - ... - └── rank_x - └── qwen2_7b_rank_x.safetensors -``` - -## 配置说明 - -加载相关配置: - -| 参数名称 | 说明 | -| ------------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| load_checkpoint | 预加载权重的文件夹路径。
    - 如果是完整权重,填写切片/单个权重文件所在文件夹路径。
    注:支持Huggingface safetensors权重加载(当前仅支持Llama系列模型)。在线加载过程中,会保存一份转换后的MindSpore safetensors权重文件至`/output/ms_safetensors`下。
    - 如果是分布式权重,需按照`model_dir/rank_x/xxx.safetensors`格式存放,文件夹路径填写为`model_dir`。 | -| load_ckpt_format | 加载的模型权重的格式,可选`ckpt`、`safetensors`,默认为`ckpt`。
    加载权重为`safetensors`格式时,需配套修改此配置为`safetensors`。 | -| auto_trans_ckpt | 是否开启在线切分功能。
    - 如果加载权重是完整权重:
    a. `use_parallel: True`时,判断为分布式加载,需同步设置`auto_trans_ckpt: True`,开启在线切分功能。
    b. `use_parallel: False`时,判断为单卡加载,需同步设置`auto_trans_ckpt: False`,关闭在线切分功能。
    - 如果加载权重是分布式权重:
    a. 不改变原有切分策略,需设置`auto_trans_ckpt: False`,直接按原先切分策略加载。
    b. 改变原有切分策略,需设置`auto_trans_ckpt: True` 并配置`src_strategy_path_or_dir`为原有切分策略文件路径。
    任务拉起时,会将权重在线合并为完整权重,并依据配置文件中设定的并行策略进行切分与加载。在线合并的完整权重会保存在当前目录`/output/unified_checkpoint`文件下。 | -| remove_redundancy | 加载的权重是否为去冗余后的权重,默认为`False`。 | - -保存相关配置: - -| 参数名称 | 说明 | -| :-------------------------- | ------------------------------------------------------------ | -| callbacks.checkpoint_format | 保存的模型权重的格式,默认值为`ckpt`。可选`ckpt`,`safetensors`。 | -| callbacks.remove_redundancy | 保存权重时是否开启去冗余保存功能,默认为`False`。仅支持`safetensors格式`。 | - -## 使用示例 - -### 预训练任务示例 - -以Llama2-7B为例,修改配置项[pretrain_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/pretrain_llama2_7b.yaml)确认权重保存格式: - -```yaml -callbacks: - - type: CheckpointMonitor - checkpoint_format: safetensors # 保存权重文件格式 - remove_redundancy: True # 保存权重时开启去冗余 -``` - -完成后执行命令: - -```shell -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_7b.yaml \ - --train_dataset_dir /{path}/wiki4096.mindrecord \ - --use_parallel True \ - --run_mode train" 8 -``` - -任务执行完成后,在mindformers/output目录下,会生成checkpoint文件夹,同时模型文件会保存在该文件夹下。 - -更多详情请参考:[预训练介绍](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/usage/pre_training.html) - -### 微调任务示例 - -若使用完整权重多卡在线微调,以Qwen2-7B模型为例,修改配置项[finetune_qwen2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2/qwen2_7b/finetune_qwen2_7b.yaml): - -```yaml -# 修改后的配置 -load_checkpoint: '/qwen2_7b/hf_unified_safetenosrs' # 加载权重文件路径 -load_ckpt_format: 'safetensors' # 加载权重文件格式 -auto_trans_ckpt: True # 完整权重时需打开此配置项,开启在线切分功能 -parallel_config: # 配置目标分布式策略 - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 1 -callbacks: - - type: CheckpointMonitor - checkpoint_format: safetensors # 保存权重文件格式 -``` - -若使用分布式权重多卡在线微调,以Qwen2-7B模型为例,修改配置项[finetune_qwen2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2/qwen2_7b/finetune_qwen2_7b.yaml): - -```yaml -# 修改后的配置 -load_checkpoint: '/qwen2_7b/distributed_safetenosrs' # 加载权重文件路径 -load_ckpt_format: 'safetensors' # 加载权重文件格式 -parallel_config: # 配置目标分布式策略 - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 1 -callbacks: - - type: CheckpointMonitor - checkpoint_format: safetensors # 保存权重文件格式 -``` - -完成后执行命令: - -```shell -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config research/qwen2/qwen2_7b/finetune_qwen2_7b.yaml \ - --train_dataset_dir /{path}/alpaca-data.mindrecord \ - --register_path research/qwen2 \ - --use_parallel True \ - --run_mode finetune" 2 -``` - -任务执行完成后,在mindformers/output目录下,会生成checkpoint文件夹,同时模型文件会保存在该文件夹下。 - -更多详情请参考:[SFT微调介绍](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/usage/sft_tuning.html) - -### 推理任务示例 - -若使用完整权重多卡在线推理,以Qwen2-7B模型为例,修改配置项[predict_qwen2_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2/qwen2_7b/predict_qwen2_7b_instruct.yaml): - -```yaml -# 修改后的配置 -load_checkpoint: '/qwen2_7b/hf_unified_safetenosrs' # 加载权重文件路径 -load_ckpt_format: 'safetensors' # 加载权重文件格式 -auto_trans_ckpt: True # 完整权重时需打开此配置项,开启在线切分功能 -parallel_config: - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 1 -``` - -若使用分布式权重多卡在线推理,以Qwen2-7B模型为例,修改配置项[predict_qwen2_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2/qwen2_7b/predict_qwen2_7b_instruct.yaml): - -```yaml -# 修改后的配置 -load_checkpoint: '/qwen2_7b/distributed_safetenosrs' # 加载权重文件路径 -load_ckpt_format: 'safetensors' # 加载权重文件格式 -parallel_config: - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 1 -``` - -完成后执行命令: - -```shell -bash scripts/msrun_launcher.sh "python run_mindformer.py \ ---config 
research/qwen2/qwen2_7b/predict_qwen2_7b_instruct.yaml \ ---run_mode predict \ ---use_parallel True \ ---register_path research/qwen2 \ ---predict_data 'I love Beijing, because'" \ -2 -``` - -执行以上单卡推理和多卡推理命令的结果如下: - -```text -'text_generation_text': [I love Beijing, because it is a city with a long history and culture.......] -``` - -更多详情请参考:[推理介绍](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/usage/inference.html) - -### 断点续训任务示例 - -MindSpore Transformers支持step级断点续训功能,允许在训练中保存模型的checkpoint,并在训练中断后,加载保存的checkpoint恢复之前的状态继续训练。 - -若使用分布式权重多卡续训且不改变切分策略,修改配置项后启动原训练任务: - -```yaml -# 修改后的配置 -load_checkpoint: '/output/checkpoint' # 加载源分布式权重文件路径 -load_ckpt_format: 'safetensors' # 加载权重文件格式 -resume_training: True # 断点续训功能开关 -callbacks: - - type: CheckpointMonitor - checkpoint_format: safetensors # 保存权重文件格式 -``` - -若分布式权重多卡续训且改变切分策略,需额外传入源切分策略文件路径,修改配置项后启动原训练任务: - -```yaml -# 修改后的配置 -load_checkpoint: '/output/checkpoint' # 加载源分布式权重文件路径 -src_strategy_path_or_dir: '/output/src_strategy' # 加载源策略文件,用于合并源分布式权重为完整权重 -load_ckpt_format: 'safetensors' # 加载权重文件格式 -auto_trans_ckpt: True # 开启在线切分功能 -resume_training: True # 断点续训功能开关 -parallel_config: # 配置目标分布式策略 - data_parallel: 2 - model_parallel: 4 - pipeline_stage: 1 -callbacks: - - type: CheckpointMonitor - checkpoint_format: safetensors # 保存权重文件格式 -``` - -大集群规模场景下,避免在线合并过程耗时过长占用训练资源,推荐将原分布式权重文件离线[合并完整权重](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/transform_weight.html#safetensors%E6%9D%83%E9%87%8D%E7%A6%BB%E7%BA%BF%E5%90%88%E5%B9%B6)后传入,无需传入源切分策略文件路径。 - -更多详情请参考:[断点续训介绍](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/resume_training.html)。 - diff --git a/docs/mindformers/docs/source_zh_cn/function/transform_weight.md b/docs/mindformers/docs/source_zh_cn/function/transform_weight.md deleted file mode 100644 index bcebdd9a86c96c3800cb665e3739cc32cdbcd1f6..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/function/transform_weight.md +++ /dev/null @@ -1,421 +0,0 @@ -# 分布式权重切分与合并 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/function/transform_weight.md) - -## 概述 - -在当前的分布式训练和推理环境中,当预训练权重与分布式策略不匹配时,需要对预训练权重进行转换,以适应相应的分布式策略。为满足不同场景下的权重转换需求,MindSpore Transformers提供了一套权重转换工具。该工具支持单卡权重切分为多卡权重、多卡权重之间的转换、多卡权重合并为单卡权重。用户可根据具体需求选择[自动转换](#自动转换)或[离线转换](#离线转换),帮助模型在不同分布式场景之间快速切换。 - -此外,MindSpore Transformers还支持[LoRA权重的合并](#lora权重合并),方便用户部署使用LoRA微调后的模型。 - -## 自动转换 - -模型加载权重时,自动转换功能可以自动检测权重与当前模型分布式切分策略之间的匹配情况,如果不匹配,自动进行权重转换,无需用户手动干预。 - -### 参数说明 - -**自动权重转换**相关`yaml`文件参数说明如下: - -| 参数名称 | 说明 | -| ------------------- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| load_checkpoint | 预加载权重的绝对路径或文件夹路径。
    - 如果是完整权重,则填写绝对路径;
    - 如果是分布式权重,则填写文件夹路径,分布式权重须按照`model_dir/rank_x/xxx.ckpt`格式存放,文件夹路径填写为`model_dir`。
    **如果rank_x文件夹下存在多个ckpt,将会使用文件名默认排序最后的ckpt文件用于转换。** | -| src_strategy_path_or_dir | 预加载权重对应的[分布式策略文件](#生成分布式策略)路径。
    - 如果预加载权重是完整权重,则**不填写**;
    - 如果预加载权重是分布式权重,且预加载权重保存时使用了流水线并行,则填写**合并的策略文件路径**或**分布式策略文件夹路径**;
    - 如果预加载权重是分布式权重,且预加载权重保存时未使用流水线并行,则填写任一**ckpt_strategy_rank_x.ckpt**路径; | -| auto_trans_ckpt | 权重自动转换开关,为True开启,默认False。 | -| transform_process_num | 权重自动转换使用的进程数,默认为1。
    - 如果transform_process_num = 1,使用**单进程转换**,转换时只有rank_0负责权重转换,其他进程等待rank_0转换结束;
    - 如果transform_process_num > 1,使用**多进程转换**,比如8卡任务,transform_process_num=2时,转换时rank_0负责rank_0/1/2/3切片权重的转换,rank_4负责rank_4/5/6/7切片权重的转换,其他进程等待rank_0/4转换结束;
    **注意**:
    1. transform_process_num越大,转换时间越短,**转换所占用的host内存越大**;当出现host侧内存不足时,需要减少transform_process_num。
    2. transform_process_num必须能够整除NPU卡数,且最大不得超过NPU卡数。 | -| transform_by_rank | 是否使用mindspore.transform_checkpoint_by_rank接口做权重转换。
    - transform_process_num > 1时,自动设置为`True`;
    - transform_process_num = 1时,如果目标权重为分布式权重,则循环调用mindspore.transform_checkpoint_by_rank串行转换每一个rank切片权重。
    - transform_process_num = 1时,如果目标权重为完整权重,则自动设置为`False`,使用mindspore.transform_checkpoints接口做权重转换; | - -### 不同场景下yaml配置说明 - -#### 单卡权重切分为多卡权重 - -```yaml -# load_checkpoint: 设置为预训练权重文件路径 -load_checkpoint: "/worker/llama3_8b/llama3_8b.ckpt" - -# auto_trans_ckpt: 开启自动转换 -auto_trans_ckpt: True -``` - -#### 多卡权重之间的转换 - -```yaml -# load_checkpoint: 设置为多卡权重文件夹路径 -load_checkpoint: "/worker/checkpoint/llama3-8b-2layer-dp2mp2pp2" - -# src_strategy_path_or_dir: 设置为分布式策略文件路径 -src_strategy_path_or_dir: "/worker/checkpoint/llama3-8b-2layer-dp2mp2pp2/strategy/merged_ckpt_strategy.ckpt" - -# auto_trans_ckpt: 开启自动转换 -auto_trans_ckpt: True -``` - -#### 多卡权重合并为单卡权重 - -```yaml -# load_checkpoint: 设置为多卡权重文件夹路径 -load_checkpoint: "/worker/checkpoint/llama3-8b-2layer-dp1mp2pp2" - -# src_strategy_path_or_dir: 设置为分布式策略文件路径 -src_strategy_path_or_dir: "/worker/checkpoint/llama3-8b-2layer-dp1mp2pp2/strategy/merged_ckpt_strategy.ckpt" - -# auto_trans_ckpt: 开启自动转换 -auto_trans_ckpt: True - -# use_parallel: 设置为False -use_parallel: False -``` - -#### 开启多进程转换(可选) - -```yaml -# transform_process_num: 设置参与转换的进程数量 -transform_process_num: 2 -``` - -### 注意事项 - -- **多进程转换**:配置`transform_process_num`参数以开启多进程转换,但需注意内存占用。如果发生内存溢出,建议降低进程数量。 - -- **自动权重转换**:开启自动转换后,系统将删除`output`目录下的旧`strategy`和`transformed_checkpoint`文件夹,并保存当前任务的输出结果。建议在转换任务结束后,将`strategy`和`transformed_checkpoint`文件夹移动到自定义目录,以避免后续操作中被误删。 - -- **分布式策略文件保存**:分布式策略文件将保存在`output/strategy`文件夹下。如果开启了**流水线并行**,系统会自动合并所有的`ckpt_strategy_rank_x.ckpt`文件,生成`merged_ckpt_strategy.ckpt`。如果未开启流水线并行,则不会进行合并操作。 - -## 离线转换 - -离线转换功能旨在满足用户手动转换权重的需求。通过离线转换,用户可以在独立的环境中进行模型权重的转换操作。离线转换支持多种权重转换场景,包括单卡权重切分为多卡权重、多卡权重之间的转换、多卡权重合并为单卡权重。 - -用户在使用离线转换时,可以根据具体需求手动配置转换参数,确保转换过程灵活且可控,尤其适用于在严格控制的计算环境中进行模型部署和优化的场景。 - -### 参数说明 - -**离线权重转换**相关`yaml`参数说明如下: - -| 参数名称 | 说明 | -| ----------------- |-----------------------------| -| src_checkpoint | 源权重的绝对路径或文件夹路径。
    - 如果是**完整权重**,则填写**绝对路径**;
    - 如果是**分布式权重**,则填写**文件夹路径**,分布式权重须按照`model_dir/rank_x/xxx.ckpt`格式存放,文件夹路径填写为`model_dir`。
    **如果rank_x文件夹下存在多个ckpt,将会使用文件名默认排序最后的ckpt文件用于转换。** | -| src_strategy_path_or_dir | 源权重对应的分布式策略文件路径。
    - 如果是完整权重,则**不填写**;
    - 如果是分布式权重,且使用了流水线并行,则填写**合并的策略文件路径**或**分布式策略文件夹路径**;
    - 如果是分布式权重,且未使用流水线并行,则填写任一**ckpt_strategy_rank_x.ckpt**路径; | -| dst_checkpoint | 保存目标权重的文件夹路径。 | -| dst_strategy | 目标权重对应的分布式策略文件路径。
    - 如果是完整权重,则**不填写**;
    - 如果是分布式权重,且使用了流水线并行,则填写**合并的策略文件路径**或**分布式策略文件夹路径**;
    - 如果是分布式权重,且未使用流水线并行,则填写任一**ckpt_strategy_rank_x.ckpt**路径; | -| prefix | 目标权重保存的前缀名,权重保存为”{prefix}rank_x.ckpt”,默认”checkpoint_”。 | -| world_size | 目标权重的切片总数,一般等于dp \* mp \* pp。 | -| process_num | 离线权重转换使用的进程数,默认为1。
    - 如果process_num = 1,使用**单进程转换**;
    - 如果process_num > 1,使用**多进程转换**,比如转换的目标权重为8卡分布式权重,process_num=2时,会启动两个进程分别负责rank_0/1/2/3和rank_4/5/6/7切片权重的转换; | - -### 离线转换配置说明 - -#### 生成分布式策略 - -MindSpore每次运行分布式任务后都会在`output/strategy`文件夹下生成对应卡数的分布式策略文件(ckpt格式),可以在离线权重转换中使用。 - -如果当前没有分布式策略文件,可以通过这种方式快速生成:在原有分布式训练/推理任务的基础上,在yaml配置文件中设置`only_save_strategy:True`来生成策略文件。设置之后任务会在生成分布式策略文件后立即停止,而不会实际执行训练或推理。 - -#### 单进程转换 - -使用[mindformers/tools/ckpt_transform/transform_checkpoint.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/ckpt_transform/transform_checkpoint.py)对载入权重进行单进程转换。 - -**运行命令**: - -```shell -python transform_checkpoint.py \ - --src_checkpoint /worker/checkpoint/llama3-8b-2layer/rank_0/llama3_8b.ckpt \ - --dst_checkpoint /worker/transform_ckpt/llama3_8b_1to8/ \ - --dst_strategy /worker/mindformers/output/strategy/ -``` - -#### 多进程转换 - -使用[mindformers/tools/ckpt_transform/transform_checkpoint.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/ckpt_transform/transform_checkpoint.sh)对载入权重进行多进程转换。 - -**运行命令**: - -```shell -bash transform_checkpoint.sh \ - /worker/checkpoint/llam3-8b-2layer/rank_0/llama3_8b.ckpt \ - None \ - /worker/transform_ckpt/llama3_8b_1to8/ \ - /worker/mindformers/output/strategy/ \ - 8 2 -``` - -**注意事项**: - -- 使用[transform_checkpoint.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/ckpt_transform/transform_checkpoint.sh)脚本时,参数`8`表示目标设备数,参数`2`表示使用2个进程进行转换。 - -## 特殊场景 - -### 物理机多机多卡训练 - -大规模模型通常需要通过多台服务器组成的集群进行训练。在这种多机多卡的场景下,如果服务器之间存在共享盘,则可以使用自动转换功能,否则只能使用离线转换。下面以两台服务器、16卡训练为例进行说明。 - -#### 场景一:服务器之间有共享盘 - -在服务器之间有共享盘的场景下,可以使用 MindSpore Transformers 的自动权重转换功能在多机多卡训练之前自动进行权重转换。假设 `/data` 为服务器的共享盘,且 MindSpore Transformers 的工程代码位于 `/data/mindformers` 路径下。 - -- **单进程转换** - - 在单进程转换模式下,只需在配置文件中配置预训练权重的路径并开启自动权重转换即可。 - - **参数配置:** - - ```yaml - # 配置预训练权重路径,填写权重文件的绝对路径 - load_checkpoint: "/worker/checkpoint/llama3-8b/rank_0/llama3_8b.ckpt" - - # 设置 auto_trans_ckpt 为 True 开启自动权重转换 - auto_trans_ckpt: True - - # 配置数据集路径 - train_dataset: &train_dataset - data_loader: - type: MindDataset - dataset_dir: "/worker/dataset/wiki103/" - shuffle: True - - # 配置16卡分布式策略(仅供参考) - parallel_config: - data_parallel: 2 - model_parallel: 4 - pipeline_stage: 2 - micro_batch_num: 2 - vocab_emb_dp: True - gradient_aggregation_group: 4 - micro_batch_interleave_num: 1 - ``` - -- **多进程转换(可选)** - - 若需要加速权重转换过程,可以选择多进程转换模式,通过配置 `transform_process_num` 参数实现。 - - **参数配置:** - - ```yaml - # 使用2个进程进行转换 - transform_process_num: 2 - ``` - - **启动任务:** - - 使用[mindformers/scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/scripts/msrun_launcher.sh)进行任务启动。 - - ```shell - # 第一台服务器(主节点) - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config {CONFIG_PATH} \ - --run_mode train" \ - 16 8 ${ip} ${port} 0 output/msrun_log False 300 - # 第二台服务器(子节点) - bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config {CONFIG_PATH} \ - --run_mode train" \ - 16 8 ${ip} ${port} 1 output/msrun_log False 300 - ``` - -#### 场景二:服务器之间无共享盘 - -在服务器之间无共享盘的情况下,需要使用离线权重转换工具进行权重转换。以下步骤描述了如何进行离线权重转换,并启动多机多卡训练任务。 - -- **获取分布式策略文件** - - 在进行离线权重转换前,首先需要获取各节点的分布式策略文件。 - - **参数配置:** - - ```yaml - # 设置 only_save_strategy 为 True 以获取分布式策略文件 - only_save_strategy: True - - # 配置数据集路径 - train_dataset: &train_dataset - data_loader: - type: MindDataset - dataset_dir: "/worker/dataset/wikitext_2048/" - shuffle: True - - # 配置16卡分布式策略(仅供参考) - parallel_config: - data_parallel: 2 - model_parallel: 4 - pipeline_stage: 2 - micro_batch_num: 2 - vocab_emb_dp: True - gradient_aggregation_group: 4 - 
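  # 16 卡对应 data_parallel(2) * model_parallel(4) * pipeline_stage(2) = 16 的切分
  # only_save_strategy 为 True 时,任务在生成分布式策略文件后即停止,不会实际执行训练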
micro_batch_interleave_num: 1 - ``` - - 各节点的策略文件将分别保存在各自的`output/strategy`目录中。例如,节点0将保存`ckpt_strategy_rank_0-7.ckpt`文件,节点1将保存`ckpt_strategy_rank_8-15.ckpt`文件。随后,需将所有节点的策略文件集中到同一台服务器上,以便进行后续操作。 - -- **离线权重转换** - - 在保存有所有策略文件的服务器上,使用[mindformers/tools/ckpt_transform/transform_checkpoint.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/ckpt_transform/transform_checkpoint.py)进行离线权重转换。 - - **单进程转换:** - - ```shell - python mindformers/tools/ckpt_transform/transform_checkpoint.py \ - --src_checkpoint /worker/checkpoint/llama3-8b/rank_0/llama_7b.ckpt \ - --dst_checkpoint ./output/llama3_8b_dp2mp4pp2 \ - --dst_strategy ./output/strategy - ``` - - **多进程转换(可选):** - - ```shell - # 使用2个进程进行转换 - bash mindformers/tools/ckpt_transform/transform_checkpoint.sh \ - /worker/checkpoint/llama3-8b/rank_0/llama_7b.ckpt \ - None \ - ./output/llama3_8b_dp2mp4pp2 \ - ./output/strategy \ - 16 2 - ``` - -- **复制权重到其他节点** - - 将转换得到的分布式权重分别复制到各自节点。0节点只需要 `rank_0` 到 `rank_7` 的切片权重,1节点只需要 `rank_8` 到 `rank_15` 的切片权重。 - -- **参数配置** - - ```yaml - # 配置预训练权重路径,填写分布式权重文件夹路径 model_dir - load_checkpoint: "/worker/checkpoint/llama3_8b_dp2mp4pp2" - - # 将 only_save_strategy 改为 False - only_save_strategy: False - ``` - -### ModelArts 训练 - -在 ModelArts 环境中进行训练与物理机上的多机多卡训练类似,同样支持开启权重自动转换。用户可以通过在训练作业的超参数中配置`auto_trans_ckpt=True`来启用自动权重转换,并通过设置`transform_process_num > 1`来开启多进程转换。 - -**注意**:如果 ModelArts 资源池中的服务器节点NPU卡数不是8,则需要额外配置`npu_num_per_node=节点NPU卡数`。例如,如果每个节点配有16个NPU,则应设置`npu_num_per_node=16`。 - -## LoRA权重合并 - -### 概述 - -LoRA(Low-Rank Adaptation)的基本原理是对原始模型的参数进行低秩重参数化。合并LoRA权重的核心过程是将 LoRA 分支的参数进行计算,并叠加到对应的模型参数中,使最终得到的权重文件的参数列表与原始模型一致,不包含额外的LoRA参数。这一操作不会对推理结果产生任何影响,因此合并后的模型在推理时依然能够保持与原始模型一致的性能。 -有关 LoRA 的详细原理和实现,请参阅以下资源: - -- 论文: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) -- GitHub: [https://github.com/microsoft/LoRA](https://github.com/microsoft/LoRA) - -### 使用说明 - -使用MindSpore Transformers提供的[LoRA权重合并脚本](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/transform_ckpt_lora.py),按照如下方式进行LoRA权重合并。 - -```shell -python mindformers/tools/transform_ckpt_lora.py \ - --src_ckpt_strategy src_strategy_path_or_dir \ - --src_ckpt_path_or_dir src_ckpt_path_or_dir \ - --dst_ckpt_dir dst_ckpt_dir \ - --prefix "checkpoint_" \ - --lora_scaling lora_alpha/lora_rank -``` - -#### 参数说明 - -- **src_ckpt_strategy**:源权重对应的分布式策略文件路径,通常在启动训练任务后默认保存在 `output/strategy/` 目录下。如果源权重为完整权重,则无需填写此参数;如果为分布式权重,需根据以下情况填写: - - **源权重开启了流水线并行**:权重转换基于合并的策略文件,填写分布式策略文件夹路径。脚本会自动将文件夹内的所有 `ckpt_strategy_rank_x.ckpt` 文件合并,并在文件夹下生成 `merged_ckpt_strategy.ckpt`。如果已经存在 `merged_ckpt_strategy.ckpt`,可以直接填写该文件的路径。 - - **源权重未开启流水线并行**:权重转换可基于任一策略文件,填写任意一个 `ckpt_strategy_rank_x.ckpt` 文件的路径即可。 - - **注意**:如果策略文件夹下已存在 `merged_ckpt_strategy.ckpt` 且仍传入文件夹路径,脚本会首先删除旧的 `merged_ckpt_strategy.ckpt`,再合并生成新的 `merged_ckpt_strategy.ckpt` 以用于权重转换。因此,请确保该文件夹具有足够的写入权限,否则操作将报错。 -- **src_ckpt_path_or_dir**:源权重的路径。如果为分布式权重,请填写源权重所在文件夹的路径,源权重应按 `model_dir/rank_x/xxx.ckpt` 格式存放,并将文件夹路径填写为 `model_dir`。若源权重为完整权重,则填写完整权重的绝对路径。 -- **dst_ckpt_dir**:目标权重的保存路径,需为自定义的空文件夹路径。目标权重将按 `model_dir/rank_x/xxx.ckpt` 格式保存。 -- **prefix**:目标权重文件的命名前缀,默认值为 "checkpoint_",即目标权重将按照 `model_dir/rank_x/checkpoint_x.ckpt` 格式保存。 -- **lora_scaling**:LoRA 权重的合并系数,默认为 `lora_alpha/lora_rank`,这两个参数即为 LoRA 模型配置时的参数,需自行计算。 - -### 示例 - -#### 场景一:包含 LoRA 参数的完整权重 - -如果合并前的权重是完整的权重文件,可以按照以下方式填写参数(直接输入完整权重的路径): - -```shell -python mindformers/tools/transform_ckpt_lora.py \ - --src_ckpt_path_or_dir .../xxx/xxx.ckpt \ - --dst_ckpt_dir dst_ckpt_dir \ - --prefix 
"checkpoint_" \ - --lora_scaling lora_alpha/lora_rank -``` - -#### 场景二:包含 LoRA 参数的分布式权重 - -如果合并前的权重是分布式的权重文件,可以按照以下方式填写参数(需输入分布式权重文件夹路径和分布式策略文件夹路径),最后得到的权重会自动合并为完整的权重文件: - -```shell -python mindformers/tools/transform_ckpt_lora.py \ - --src_ckpt_strategy .../xxx/mindformers/output/strategy/ \ - --src_ckpt_path_or_dir .../xxx/model_dir \ - --dst_ckpt_dir dst_ckpt_dir \ - --prefix "checkpoint_" \ - --lora_scaling lora_alpha/lora_rank -``` - -## Safetensors权重离线合并 - -### 使用说明 - -使用MindSpore Transformers提供的[safetensors权重合并脚本](https://gitee.com/mindspore/mindformers/blob/r1.5.0/toolkit/safetensors/unified_safetensors.py),按照如下方式进行safetensors权重合并。 - -```shell -python toolkit/safetensors/unified_safetensors.py \ - --src_strategy_dirs src_strategy_path_or_dir \ - --mindspore_ckpt_dir mindspore_ckpt_dir\ - --output_dir output_dir \ - --file_suffix "1_1" \ - --has_redundancy has_redundancy -``` - -#### 参数说明 - -- **src_strategy_dirs**:源权重对应的分布式策略文件路径,通常在启动训练任务后默认保存在 `output/strategy/` 目录下。分布式权重需根据以下情况填写: - - **源权重开启了流水线并行**:权重转换基于合并的策略文件,填写分布式策略文件夹路径。脚本会自动将文件夹内的所有 `ckpt_strategy_rank_x.ckpt` 文件合并,并在文件夹下生成 `merged_ckpt_strategy.ckpt`。如果已经存在 `merged_ckpt_strategy.ckpt`,可以直接填写该文件的路径。 - - **源权重未开启流水线并行**:权重转换可基于任一策略文件,填写任意一个 `ckpt_strategy_rank_x.ckpt` 文件的路径即可。 - - **注意**:如果策略文件夹下已存在 `merged_ckpt_strategy.ckpt` 且仍传入文件夹路径,脚本会首先删除旧的 `merged_ckpt_strategy.ckpt`,再合并生成新的 `merged_ckpt_strategy.ckpt` 以用于权重转换。因此,请确保该文件夹具有足够的写入权限,否则操作将报错。 -- **mindspore_ckpt_dir**:分布式权重路径,请填写源权重所在文件夹的路径,源权重应按 `model_dir/rank_x/xxx.safetensors` 格式存放,并将文件夹路径填写为 `model_dir`。 -- **output_dir**:目标权重的保存路径,默认值为 "/new_llm_data/******/ckpt/nbg3_31b/tmp",即目标权重将放置在 `/new_llm_data/******/ckpt/nbg3_31b/tmp` 目录下。 -- **file_suffix**:目标权重文件的命名后缀,默认值为 "1_1",即目标权重将按照 `*1_1.safetensors` 格式查找。 -- **has_redundancy**:合并的权重是否是去除冗余的权重,默认为 `True`。 -- **filter_out_param_prefix**:合并权重时可自定义过滤掉部分参数,过滤规则以前缀名匹配。如优化器参数"adam_"。 -- **max_process_num**:合并最大进程数。默认值:64。 - -### 示例 - -#### 场景一:去除冗余的safetensors权重 - -如果合并去除冗余的safetensors权重,可以按照以下方式填写参数: - -```shell -python toolkit/safetensors/unified_safetensors.py \ - --src_strategy_dirs src_strategy_path_or_dir \ - --mindspore_ckpt_dir mindspore_ckpt_dir\ - --output_dir output_dir \ - --file_suffix "1_1" \ - --has_redundancy True -``` - -#### 场景二:不去除冗余的safetensors权重 - -如果合并非去除冗余的safetensors权重,可以按照以下方式填写参数: - -```shell -python toolkit/safetensors/unified_safetensors.py \ - --src_strategy_dirs src_strategy_path_or_dir \ - --mindspore_ckpt_dir mindspore_ckpt_dir\ - --output_dir output_dir \ - --file_suffix "1_1" \ - --has_redundancy False -``` - -#### 场景三:过滤Adam优化器的safetensors权重 - -如果合并过滤Adam优化器的safetensors权重,可以按照以下方式填写参数: - -```shell -python toolkit/safetensors/unified_safetensors.py \ - --src_strategy_dirs src_strategy_path_or_dir \ - --mindspore_ckpt_dir mindspore_ckpt_dir\ - --output_dir output_dir \ - --file_suffix "1_1" \ - --filter_out_param_prefix "adam_" -``` \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/function/weight_conversion.md b/docs/mindformers/docs/source_zh_cn/function/weight_conversion.md deleted file mode 100644 index 15b7eb1dd2a496efcd842fdf9df2b8e1d67908d7..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/function/weight_conversion.md +++ /dev/null @@ -1,124 +0,0 @@ -# 权重格式转换 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/function/weight_conversion.md) - -## 概述 - -MindSpore 
Transformers提供了统一的权重转换工具,能够将模型权重在HuggingFace所使用的格式与MindSpore Transformers所使用的格式之间相互转换。这可以帮助用户: - -- 将HuggingFace权重转换为MindSpore Transformers权重,在MindSpore Transformers上进行微调、测评或推理。 -- 把使用MindSpore Transformers训练或微调得到的权重转换为HuggingFace权重,并在其他框架上使用。 - -## 转换步骤 - -要进行权重转换,首先请将待转换模型的HuggingFace仓库完整克隆到本地,然后执行`mindformers/convert_weight.py`脚本。该脚本能够自动将HuggingFace的模型权重文件转换为适用于MindSpore Transformers的权重文件。如若希望将MindSpore Transformers权重转为HuggingFace权重,请将`reversed`设置为`True`。 - -```shell -python convert_weight.py [-h] --model MODEL [--reversed] --input_path INPUT_PATH --output_path OUTPUT_PATH [--dtype DTYPE] [--n_head N_HEAD] [--hidden_size HIDDEN_SIZE] [--layers LAYERS] [--is_pretrain IS_PRETRAIN] [--telechat_type TELECHAT_TYPE] -``` - -### 参数说明 - -- model:模型名称。 -- reversed:将MindSpore Transformers权重转换为HuggingFace权重。 -- input_path:HuggingFace权重文件夹的路径,指向已下载的权重文件。 -- output_path:转换后MindSpore Transformers权重文件的保存路径。 -- dtype:转换后的权重数据类型。 -- n_head:只对BLOOM模型生效,使用`bloom_560m`时请设为`16`,使用`bloom_7.1b`时请设为`32`。 -- hidden_size:只对BLOOM模型生效,使用`bloom_560m`时请设为`1024`,使用`bloom_7.1b`时请设为`4096`。 -- layers:只对GPT2和WizardCoder模型生效,模型被转换的层数。 -- is_pretrain:只对Swin模型生效,转换预训练权重。 -- telechat_type:只对TeleChat模型生效,TeleChat模型的版本。 - -## 转换示例 - -假设用户已经下载了[Llama2模型的权重](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD),并保存在路径`/home/user/torch_weights`中,用户希望将其转换为MindSpore Transformers权重并保存在路径`/home/user/ms_weights`中,可以使用以下命令: - -```bash -python convert_weight.py --model llama --input_path /home/user/torch_weights --output_path /home/user/ms_weights/llama.ckpt -``` - -通过以上步骤,可将HuggingFace权重成功转换为MindSpore Transformers权重,方便在MindSpore Transformers中继续模型训练或推理。 - -## 已支持模型 - -| 参数取值 | 支持模型 | -|-----------|-------------------------------------------| -| llama | Llama2、Llama3、Llama3.1、CodeLlama | -| baichuan2 | Baichuan2 | -| glm-n | GLM2、GLM3、GLM3-32K、GLM4 | -| cogvlm2 | CogVLM2-Video、CogVLM2-Image | -| qwen | Qwen、Qwen1.5、Qwen2 | -| qwenvl | QwenVL | -| internlm | InternLM | -| internlm2 | InternLM2 | -| yi | Yi | -| mixtral | Mixtral | -| deepseek | DeepSeekCoder、DeepSeekCoder1.5、DeepSeekV2 | -| gpt | GPT2 | -| whisper | Whisper | - -## 未支持模型权重转换开发 - -1. 在扩展模型目录下新增`convert_weight.py`及`convert_reversed.py`文件。 -2. 在文件中分别编写`convert_pt_to_ms`及`convert_ms_to_pt`权重转换函数,函数参数为`input_path`、`output_path`、`dtype`及额外参数`**kwargs`。 -3. 在MindSpore Transformers代码根目录下`convert_weight.py`文件中的`convert_map`和`reversed_convert_map`字典中加入扩展模型名称及转换函数引入路径。 -4. 在`main`函数中通过调用`parser.add_argument()`方法新增额外参数。 - -## 模型权重转换开发示例 - -此处以Llama为例。如若希望转换HuggingFace权重至MindSpore Transformers权重,需在[convert_weight.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/llama/convert_weight.py)内定义`convert_pt_to_ms`函数: - -```python -def convert_pt_to_ms(input_path, output_path, dtype=None, **kwargs): - """convert hf weight to ms.""" - print(f"Trying to convert huggingface checkpoint in '{input_path}'.", flush=True) - try: - from transformers import LlamaForCausalLM - except: - raise ImportError(f"Failed to load huggingface checkpoint. 
Please make sure transformers is available.") - - try: - model_hf = LlamaForCausalLM.from_pretrained(os.path.dirname(input_path)) - except Exception as e: - print(f"Do not find huggingface checkpoint in '{os.path.dirname(input_path)}', Error {e.message}.", flush=True) - return False - ckpt_list = [] - for name, value in model_hf.state_dict().items(): - name = name_replace(name) - if name == 'norm.weight': - name = 'norm_out.weight' - if name[:7] == 'layers.': - name = name[7:] - - print(f'\rprocessing parameter: {name} {value.shape} ', end='', flush=True) - ckpt_list.append({'name': name, 'data': pt2ms(value, dtype)}) - - ms.save_checkpoint(ckpt_list, output_path) - print(f"\rConvert huggingface checkpoint finished, the mindspore checkpoint is saved in '{output_path}'.", - flush=True) - return True -``` - -而若是希望转换MindSpore Transformers权重至HuggingFace权重,则需在[convert_reversed.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/llama/convert_reversed.py)内定义`convert_ms_to_pt`函数: - -```python -def convert_ms_to_pt(input_path, output_path, dtype=None, **kwargs): - """convert ms weight to hf.""" - print(f"Trying to convert mindspore checkpoint in '{input_path}'.", flush=True) - model_ms = ms.load_checkpoint(input_path) - - state_dict = {} - for name, value in model_ms.items(): - name = name_replace(name) - print(f'\rprocessing parameter: {name} {value.shape} ', end='', flush=True) - if is_lora_param(name): - name = name.replace('.tk_delta_lora_a', '.lora_A.weight') - name = name.replace('.tk_delta_lora_b', 'lora_B.weight') - state_dict[name] = ms2pt(value, dtype) - - torch.save(state_dict, output_path) - print(f"\rConvert mindspore checkpoint finished, the huggingface checkpoint is saved in '{output_path}'.", - flush=True) - return True -``` diff --git a/docs/mindformers/docs/source_zh_cn/index.rst b/docs/mindformers/docs/source_zh_cn/index.rst deleted file mode 100644 index e947483d0ad9ff1daa2b4a06ee7ba66ca554612b..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/index.rst +++ /dev/null @@ -1,227 +0,0 @@ -MindSpore Transformers 文档 -========================================= - -MindSpore Transformers(也称MindFormers)是一个MindSpore原生的大模型套件,旨在提供大模型训练、微调、评估、推理、部署等全流程开发能力,提供业内主流的Transformer类预训练模型和SOTA下游任务应用,涵盖丰富的并行特性,期望帮助用户轻松地实现大模型训练和创新研发。 - -用户可以参阅 `整体架构 `_ 和 `模型库 `_ ,快速了解MindSpore Transformers的系统架构,及所支持的功能特性和大模型清单。进一步地,可参考 `安装 `_ 和 `快速启动 `_ 章节,上手探索MindSpore Transformers。 - -如果您对MindSpore Transformers有任何建议,请通过 `issue `_ 与我们联系,我们将及时处理。 - -MindSpore Transformers支持一键启动任意任务的单卡/多卡训练、微调、评估、推理流程,它通过简化操作、提供灵活性和自动化流程,使得深度学习任务的执行变得更加高效和用户友好,用户可以通过以下说明文档进行学习: - -.. raw:: html - - - - - - - - - - - - - - - - -
    - -代码仓地址: - -使用MindSpore Transformers进行灵活易用的个性化配置 ------------------------------------------------------ - -MindSpore Transformers以其强大的功能集,为用户提供了灵活易用的个性化配置选项。其关键特性包括: - -1. `权重格式转换 `_ - - 提供统一的权重转换工具,能够将模型权重在HuggingFace所使用的格式与MindSpore Transformers所使用的格式之间相互转换。 - -2. `分布式权重切分与合并 `_ - - 不同分布式场景下的权重灵活地进行切分与合并。 - -3. `分布式并行 `_ - - 一键配置多维混合分布式并行,让模型在上至万卡的集群中高效运行。 - -4. `数据集 `_ - - 支持多种类型和格式的数据集。 - -5. `权重保存与断点续训 `_ - - 支持step级断点续训,有效减少大规模训练时意外中断造成的时间和资源浪费。 - -6. `训练指标监控 `_ - - 提供大模型训练阶段的可视化服务,用于监控和分析训练过程中的各种指标和信息。 - -7. `训练高可用 `_ - - 提供大模型训练阶段的高可用能力,包括临终 CKPT 保存、UCE 故障容错恢复和进程级重调度恢复功能。 - -8. `Safetensors权重 `_ - - 支持safetensors格式的权重文件保存及加载功能。 - -9. `细粒度激活值SWAP `_ - - 支持细粒度地选择特定激活值使能SWAP,用于降低模型训练的峰值内存开销。 - -使用MindSpore Transformers进行深度调优 --------------------------------------- - -- `精度调优 `_ -- `性能调优 `_ - -附录 ------------------------------------- - -- `环境变量说明 `_ -- `配置文件说明 `_ - -FAQ ------------------------------------- - -- `模型相关 `_ -- `功能相关 `_ -- `MindSpore Transformers贡献指南 `_ -- `魔乐社区贡献指南 `_ - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: 开始 - :hidden: - - start/overview - start/models - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: 快速入门 - :hidden: - - quick_start/install - quick_start/source_code_start - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: 使用教程 - :hidden: - - usage/dev_migration - usage/multi_modal - usage/pre_training - usage/sft_tuning - usage/evaluation - usage/inference - usage/quantization - usage/mindie_deployment - usage/pretrain_gpt - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: 功能说明 - :hidden: - - function/weight_conversion - function/transform_weight - function/distributed_parallel - function/dataset - function/resume_training - function/monitor - function/high_availability - function/safetensors - function/fine_grained_activations_swap - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: 精度调优 - :hidden: - - acc_optimize/acc_optimize - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: 性能调优 - :hidden: - - perf_optimize/perf_optimize - -.. toctree:: - :maxdepth: 1 - :caption: API参考 - :hidden: - - mindformers - mindformers.core - mindformers.dataset - mindformers.generation - mindformers.models - mindformers.modules - mindformers.pet - mindformers.pipeline - mindformers.tools - mindformers.wrapper - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: 附录 - :hidden: - - appendix/env_variables - appendix/conf_files - -.. 
toctree:: - :glob: - :maxdepth: 1 - :caption: FAQ - :hidden: - - faq/model_related - faq/func_related - faq/mindformers_contribution - faq/modelers_contribution diff --git a/docs/mindformers/docs/source_zh_cn/perf_optimize/images/cast.png b/docs/mindformers/docs/source_zh_cn/perf_optimize/images/cast.png deleted file mode 100644 index a225d668969c329c22f2d966db71ad6a492a5351..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/perf_optimize/images/cast.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/perf_optimize/images/mstx.png b/docs/mindformers/docs/source_zh_cn/perf_optimize/images/mstx.png deleted file mode 100644 index 171c36574dbf9dc6893866f1471ecf6e47c906f9..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/perf_optimize/images/mstx.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/perf_optimize/images/reshape.png b/docs/mindformers/docs/source_zh_cn/perf_optimize/images/reshape.png deleted file mode 100644 index 6f9b5e46046b52db23b521a5bc8f0823b3139508..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/perf_optimize/images/reshape.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/perf_optimize/images/silu_mul.png b/docs/mindformers/docs/source_zh_cn/perf_optimize/images/silu_mul.png deleted file mode 100644 index e7936203dfd07d40a2b7840e9ceba23eff34e76d..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/perf_optimize/images/silu_mul.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/perf_optimize/images/studio.png b/docs/mindformers/docs/source_zh_cn/perf_optimize/images/studio.png deleted file mode 100644 index aee6e6a17285b270e5e54bf96477a0c7dcba42ef..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/perf_optimize/images/studio.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/perf_optimize/perf_optimize.md b/docs/mindformers/docs/source_zh_cn/perf_optimize/perf_optimize.md deleted file mode 100644 index 3c3aee08d38a7450f1ffcca3886b76dae5e7943d..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/perf_optimize/perf_optimize.md +++ /dev/null @@ -1,687 +0,0 @@ -# 大模型性能调优指南 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/perf_optimize/perf_optimize.md) - -## 概述 - -本文档主要介绍大语言模型的性能调优,详细介绍了性能调优相关的基础理论知识、相关工具使用指导和性能调优整体思路,以及案例分享。开始大模型性能调优工作时,应具备大模型的基础知识。为避免发散,本文档将不会解释大模型相关基础概念,聚焦性能调优介绍。 - -性能一般讨论的是模型训练性能,即在指定模型和输入数据的情况下,完成一次端到端训练所需要时间。端到端是指完成一个人工智能模型单步训练的过程,时间主要由以下部分构成: - -* 数据加载时间:指的是模型加载训练数据和权重的时间,包括将数据从硬件存储设备读取到CPU、在CPU中进行数据的预处理、以及CPU数据传输到NPU的过程。对于需要切分到若干张NPU上的模型,数据加载时间还包括从一张NPU广播到其他NPU上的时间。 - -* 模型正向计算(Forward)反向计算(Backward)时间,包含前向的数据计算和反向的数据微分求导。 - -* 优化器时间:指的是模型参数更新时间。 - -* 模型后处理时间:指的是优化器更新后,包括数据的后处理或者必要的同步操作,通常取决于模型特定的操作。 - -* 通信时间:概念比较宽泛,涵盖单节点的卡间通信耗时和多节点的节点间通信耗时。通过MindSpore的并行技术,通信和计算通常可以并行执行,此时部分通信时间会被掩盖,因此一般考虑未被计算掩盖的通信时间。 - -* 调度时间:指模型从CPU指令到调用NPU内核所需要的时间。 - -性能调优旨在通过优化模型算法、参数和并行策略等手段,降低上述各部分时间,一般重点关注模型前向反向时间以及通信时间进行优化。 - -## 基础简介 - -### 性能指标 - -性能通常通过吞吐量指标进行评估,对于大语言模型,吞吐量主要是指每秒钟每张卡处理的token数量。计算公式如下: - -$$ -Throughput = SeqLength * (sample/s/p) -$$ - -(sample/s/p)的计算结果可以直接从日志中获取,也可以从日志中分别获取对应字段再进行计算。 - -各字段含义如下: - -* 
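代入一组假设数值可以直观地验证该公式(数值仅作演示,非实测结果):设 SeqLength=4096,sample(即 global_batch_size)=128,s(每步耗时)=2 秒,p(数据并行数)=8,则每张卡每秒约处理 32768 个 token:

$$
Throughput = 4096 \times \frac{128}{2 \times 8} = 32768
$$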
SeqLength:指序列的长度,在文本处理过程中,输入的文本需要转换成数字序列,这些数字序列作为模型的输入。SeqLength就是指这些数字序列的长度,即文本的长度。在模型训练和推理的过程中,需要设置一个固定的SeqLength,以便进行批处理和计算。较长的SeqLength可以提高模型的准确性,但会增加计算量和内存消耗;而较短的SeqLength则会减少计算量和内存消耗,但可能会降低模型的准确性。 - -* sample:其值等于全局批量大小,即global_batch_size的值。在分布式训练中,数据被分成多个部分,每个部分被送到不同的NPU上进行计算。这些NPU上的Batch Size之和就是全局批量大小。全局批量大小的选择是一个重要的决策,因为它会直接影响模型的训练性能。如果全局批量过小,每个NPU上的Batch Size可能会太小,导致模型的收敛速度变慢;如果全局批量过大,每个NPU上的Batch Size可能会太大,导致NPU内存不足或者模型的精度下降。一个找到最佳Batch Size的经验法则是使其达到NPU对给定数据类型的内存限制,即Batch Size占满NPU内存。 - -* s:即per_step_time,以秒为单位,指在训练过程中,每一步所花费的时间。 - -* p:即parallel_num,指数据并行维度大小。 - -### 并行特性简介 - -在大模型训练中,由于数据量和模型复杂度的增加,单个计算节点的计算能力往往难以满足训练的需求。为了提高训练效率和加速训练过程,通常采用并行策略将计算任务分配给多个计算节点。 - -并行策略通常分为以下几种: - -* 数据并行(Data Parallelism,简称DP) - -* 模型并行(一般指张量并行Tensor Parallelism,简称TP) - -* 流水并行(Pipeline Parallelism,简称PP) - -* 优化器并行(Optimizer Parallelism,简称OP) - -* 序列并行(Sequence Parallelism,简称SP) - -* 多副本并行 - -在实际应用中,通常会采用多种并行策略和优化手段,例如使用优化器并行和重计算等方式,以减少模型对内存的使用并提高训练效率。并行策略设计与模型的效率密切相关,因此在模型调优之前先确定一组或多组较优的并行策略,是至关重要的。 - -详细介绍参考文档[并行策略指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/distributed_parallel.html)。 - -对于不同的参数量规格的模型,可参考以下并行策略选择方向: - -* 模型规模较小时(~7B),可使用纯数据并行+优化器并行,如果内存富裕可以进一步开启梯度累积; -* 模型规模适中时(~13B),可进一步使用流水线并行,并调整重计算,让单卡显存能够支持切分后的模型训练,并减少引入的通信量; -* 模型规模较大时,需开启模型并行以降低权重的显存占用,同时短序列并行与多副本并行也建议开启以提升性能; -* 在训练长序列时(>=32k),可使用长序列并行及相关特性以减小长序列激活值的显存使用。 - -### 重计算 - -MindSpore采用反向模式的自动微分,根据正向图计算流程自动推导出反向图,正向图和反向图共同构成了完整的计算图。在计算某些反向算子时,需要使用一些正向算子的计算结果,导致这些计算结果需要保存在内存中,直到依赖它们的反向算子计算完成,占用的内存才会被复用。这一现象提高了训练的内存峰值,在大规模网络模型中尤为显著。 - -为了解决这个问题,MindSpore提供了重计算的功能,可以不保存正向算子的计算结果,从而释放内存以供复用。在计算反向算子时,如果需要正向的结果,再重新计算正向算子。 - -重计算分为以下两种方式: - -* 完全重计算 - - 适用于内存资源极为受限的极端环境。在这种模式下,除了保存输入数据外,所有激活值均在需要时重新计算,最大限度地减少了对内存的依赖,然而计算量也会显著增加。 - -* 选择性重计算 - - 该策略保留了那些占用较小内存空间但重计算成本较高的激活值,如Cast、SiLU-Mul。同时,对占用较大内存但重计算成本相对较低的激活值执行重计算。此方法在保证模型性能的同时,实现了内存使用的高效管理。 - -#### Cast重计算 - -RMSNorm一般使用高精度(FP32)计算,计算之前需要将输入从低精度(FP16或BF16)通过Cast转成高精度(FP32)。RMSNorm需要保存输入以用于反向计算。因此对Cast进行重计算可以只保存Cast的低精度输入,而非高精度输入,从而可以减少一半的内存占用,达到节省内存的效果。 - -![cast](./images/cast.png) - -然而从高精度到低精度的Cast算子进行重计算,会导致后面的算子原本只需要保存Cast之后的低精度内存,但是由于Cast算子重计算,需要保存高精度内存,反而会导致内存占用增加。 - -#### SiLU-Mul重计算 - -在FeedForward中,中间部分内存占用通常较大。由于SiLU和Mul重计算代价较小,对SiLU和Mul算子重计算,可以省下w2的MatMul和Mul的第一个输入的内存。 - -![SiLU_mul](./images/silu_mul.png) - -### 工具介绍 - -#### profiler工具 - -MindSpore Transformers本身集成了profiling数据采集的功能,使用步骤如下: - -1. 
修改配置文件 - - 在模型的配置文件中开启profiling开关,需修改的参数如下: - - ```yaml - profile: True # 是否开启性能分析工具 - profile_start_step: 5 # 性能分析开始的step - profile_stop_step: 6 # 性能分析结束的step - init_start_profile: False # Profiler初始化的时候开启,开启后profile_start_step将不生效。 - profile_communication: False # 是否在多NPU训练中收集通信性能数据 - profile_memory: True # 收集Tensor内存数据 - mstx: True # 是否收集mstx时间戳记录,包括训练step、通信算子等 - ``` - - profile_start_step和profile_stop_step用于确定采集区间,因为采集耗时较长,不推荐将区间设置过大,建议设置为2到4步。且由于第一个step涉及编译,推荐从第3步开始采集。 - - profiling全部可配置参数如下: - - | 参数 | 说明 | 类型 | - |-----------------------|--------------------------------------------------------------------------------------------|------| - | profile | 是否开启性能采集工具,默认值为`False`。 | bool | - | profile_start_step | 设置开始采集性能数据的step数,默认值为`1`。 | int | - | profile_stop_step | 设置停止采集性能数据的step数,默认值为`10`。 | int | - | profile_communication | 设置是否在多设备训练中收集通信性能数据,使用单卡训练时,该参数无效,默认值为`False`。 | bool | - | profile_memory | 设置是否收集Tensor内存数据,默认值为`True`。 | bool | - | profile_rank_ids | 设置开启性能采集的rank ids,默认值为`None`,表示所有rank id均开启性能采集。 | list | - | profile_pipeline | 设置是否按流水线并行每个stage的其中一张卡开启性能采集,默认值为`False`。 | bool | - | profile_output | 设置保存性能采集生成文件的文件夹路径。 | str | - | profile_level | 设置采集数据的级别,可选值为(0, 1, 2),默认值为`1`。 | int | - | with_stack | 设置是否收集Python侧的调用栈数据,默认值为`False`。 | bool | - | data_simplification | 设置是否开启数据精简,开启后将在导出性能采集数据后删除FRAMEWORK目录以及其他多余数据,默认为`False`。 | int | - | init_start_profile | 设置是否在Profiler初始化时开启采集性能数据,设置`profile_start_step`时该参数不生效。开启`profile_memory`时需要将该参数设为`True`。 | bool | - | mstx | 设置是否收集mstx时间戳记录,包括训练step、HCCL通信算子等,默认值为`False`。 | bool | - -2. 查看数据 - - 采集工具默认会在`./output`路径下创建一个`profile`文件夹,该路径可通过模型yaml配置文件的`profile_output`或`output_dir`字段进行设置,前者更优先。 - - 生成的文件及介绍参考[profile文件介绍](https://www.mindspore.cn/tutorials/zh-CN/r2.6.0/debug/profiler.html),主要收集算子、任务等运行耗时、CPU利用率及内存消耗等信息,用于性能调优分析。 - - 此外还可以通过统计集群中每个rank的计算时间、通信时间、未掩盖通信时间,分析集群中不同rank间的性能情况,以此判断是否存在计算负载不均衡的情况,影响了集群的整体效率,并对此进行针对性优化。 - -3. 
查看mstx信息 - - mstx记录信息不会由采集工具直接生成,需要手动通过命令行从`profile`文件夹中提取。以第一张卡为例,如下为相应的目录结构: - - ```sh - output - └── profile - └── rank_0 - └── {hostname}_{pid}_{时间戳}_ascend_ms - └── PROF_{数字}_{时间戳}_{字符串} - ``` - - 执行以下命令: - - ```shell - msprof --export=on --output={path}/output/profile/rank_0/{hostname}_{pid}_{时间戳}_ascend_ms/PROF_{数字}_{时间戳}_{字符串} # 替换为实际路径 - ``` - - 执行完毕后会在PROF_{数字}_{时间戳}_{字符串}目录下生成`mindstudio_profiler_output`文件夹,其中命名为`msprof_tx_{时间戳}.csv`的文件即为mstx记录信息,包含训练step、HCCL通信算子等数据的时间戳和相应的描述内容,如下图所示: - - ![mstx](./images/mstx.png) - -#### DryRun内存评估工具 - -当前内存评估工具主要使用MindSpore的模拟编译(dryrun)。模拟编译使用方式在MindSpore的[环境变量文档](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/env_var_list.html)和[msrun文档](https://www.mindspore.cn/tutorials/zh-CN/r2.6.0/parallel/msrun_launcher.html)中呈现。可以通过在训练进程开始前使能环境变量`export MS_SIMULATION_LEVEL=1`或者在msrun启动项配置`--sim_level`功能,即可拉起模拟编译的训练进程。 - -可以使用DryRun分析所需内存是否超过最大可用内存。如果超过,需要重新调整配置。最大可用内存可通过如下字段配置,推荐值为`58GB`,如果设置过大,可能导致其他组件内存不足。通常使用的集群训练规模越大,其他组件内存占用越大,MindSpore进程可用的最大内存也会随之降低,例如在千卡集群上,该最大可用内存值一般设置为`54GB`。 - -```yaml -context: - max_device_memory: "58GB" -``` - -新建脚本`dry_run.sh`,脚本内容如下: - -```shell -#!/bin/bash - -YAML_FILE=$1 -RANK_SIZE=$2 -PIPELINE_STAGES=$3 -RANK_GAP=$((RANK_SIZE/PIPELINE_STAGES)) -ROOT_PATH=`pwd` - -export MS_SIMULATION_LEVEL=1 -export RANK_SIZE=$RANK_SIZE - -rm -rf output_dryrun -mkdir output_dryrun -for((i=0; i<$PIPELINE_STAGES; i++)) -do - export DEVICE_ID=$i - export RANK_ID=$((i*RANK_GAP)) - echo "start training for rank $RANK_ID, device $DEVICE_ID" - # 需要正确指定 run_mindformer.py 路径 - python ./run_mindformer.py --config $ROOT_PATH/$1 &> ./output_dryrun/rank_$RANK_ID.log & -done -``` - -执行脚本: - -```shell -bash dry_run.sh $train.yaml $rank_size $stage -``` - -三个参数含义如下: - -* $train.yaml:需要调试的配置文件 -* $rank_size:模拟卡数 -* $stage:阶段数,等于流水线并行数量 - -执行完成后,输出目录`output_dryrun`下会生成每个stage的日志信息,每个日志末尾会打印如下信息: - -```text -Device MOC memory size: 62432M -MindSpore Used memory size: 59392M -MindSpore memory base address: 0 -Used peak memory usage (without fragments): 48874M -Actual peak memory usage (with fragments): 48874M -``` - -Used peak memory usage (without fragments):表示不包含碎片的NPU内存使用峰值,重点关注该值,建议不超过最大可用内存。 - -Actual peak memory usage (with fragments):表示包含碎片的NPU内存使用峰值。 - -注意事项: - -1. 使用`dryrun`模拟编译时,若数据集过大,会导致运行时间过长,因此需要控制数据集大小,只需跑完几个step即可; -2. 在pipeline并行场景下,每个PP stage在训练过程中所需的内存不同,因此至少每个stage都需要一个rank进行dryrun;换言之,同一个PP stage内所有rank的内存情况都完全一致,仅需跑一个rank的模拟编译即可分析整体内存情况; -3. `dryrun`任务也会生成分布式策略文件,启动`dryrun`任务即可生成各PP stage的策略文件,由于相同stage的分布式策略文件完全相同,因此只需要每个PP stage获得一个策略文件即可; -4. 
运行结束后将会在日志中打印当前任务所消耗的内存大小,可根据该信息评估内存使用,进行内存调优。 - -#### MindStudio Insight - -MindStudio Insight提供了性能数据的多种呈现形式,包括Timeline视图、通信分析和计算耗时等可视化呈现,以便用户分析潜在的性能瓶颈,并指导如何采取措施消除或减少这些瓶颈。MindStudio Insight支持在Timeline视图中查看集群场景下Profiling导出的数据,并以单卡为维度进行展示,可以支持20GB以上的集群性能文件分析。 - -点击[MindStudio Insight下载链接](https://www.hiascend.com/developer/download/community/result?module=pt+sto+cann),选择合适的版本安装。 - -打开MindStudio Insight工具,单击界面左上方工具栏中的“+”,在弹窗中选择要解析并导出的文件或目录,然后单击“确认”导入。 - -MindStudio Insight工具以时间线(Timeline)的形式呈现全流程在线推理、训练过程中的运行情况,并按照调度流程来呈现整体的运行状况,并且该工具支持集群Timeline展示。通过分析时间线,用户可以对在线推理/训练过程进行细粒度的分析,如迭代间隙是否过长、算子执行时间等,并提供易用性功能辅助用户快速定位性能瓶颈。 - -时间线(Timeline)界面包含工具栏(区域一)、时间线树状图(区域二)、图形化窗格(区域三)和数据窗格(区域四)四个部分,如图所示。 - -![studio](./images/studio.png) - -* 区域一 - - 工具栏,包含常用快捷按钮,从左至右依次为标记列表、过滤(支持按卡或按专项层过滤展示)、搜索、连线事件、复原、时间轴缩小和时间轴放大。 - -* 区域二 - - 时间线树状图,显示集群场景下各“Card”的分层信息,一层级为“Card”,二层级为进程或专项分层,三层级为线程等。包括上层应用数据(包含上层应用算子的耗时信息)、CANN层数据(包含AscendCL、GE和Runtime组件的耗时数据)、底层NPU数据(包含Ascend Hardware下各个Stream任务流的耗时数据和迭代轨迹数据、HCCL和Overlap Analysis通信数据以及其他昇腾AI处理器系统数据)、打点数据和AI Core Freq层级。 - -* 区域三 - - 图形化窗格,展示的数据是迭代内的数据,图形化窗格对应时间线树状图,逐行对时间线进行图形化展现,包括上层应用算子、各组件及接口的执行序列和执行时长。 - -* 区域四 - - 数据窗格,统计信息或算子详情信息展示区,选中详情(Slice Detail)为选中单个算子的详细信息,选中列表(Slice List)为某一泳道选中区域的算子列表信息,系统视图(System View)为某类算子的汇总信息。 - -单击时间线页面树状图或者图形化窗格任意位置,可以使用键盘中的W(放大)、A(左移)、S(缩小)、D(右移)键进行操作,支持放大的最大精度为1ns。本工具可以提供概览、内存、算子、通信等多个维度的分析,辅助进行性能调优。详细使用方法参考[MindStudio Insight用户指南](https://www.hiascend.com/document/detail/zh/mindstudio/70RC3/msinsightug/msascendinsightug/Insight_userguide_0002.html)。 - -#### IR 图 - -在[MindSpore Transformers配置文件](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html)中,只需要开启save_graphs,运行时会输出一些图编译过程中生成的.ir后缀的中间文件,这些被称为IR文件。默认情况下,这些文件会保存在当前执行目录下的graph目录中。IR文件是一种比较直观易懂的文本格式文件,用于描述模型结构的文件,可以直接用文本编辑软件查看。配置项含义参考[Config配置说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html),配置方法如下: - -```yaml -context: - save_graphs: True - save_graphs_path: "./graph" -``` - -以下是部分IR图的节选: - -```text - %13(equiv_180_CNode_16165) = Load(%para6_model.layers.0.attention.wq.weight, UMonad[U]) cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782039"} - : (, ) -> () - # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Load-op0) - %14(equiv_16877_x) = PrimFunc_MatMul(%12, %13, Bool(0), Bool(1)) {instance name: matmul} primitive_attrs: {in_strategy: ((1, 1), (8, 1))} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782146", origin_output_shape: (4096, 4096), micro: I64(0), origin_input_shapes: ((4096, 4096), (4096, 4096))} {in_strategy: ((1, 1), (8, 1))} - : (, , , ) -> () - # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/wq-Linear/MatMul-op0) - %15(equiv_16876_CNode_30913) = PrimFunc_Reshape(%14, (I64(1), I64(4096), I64(4), I64(128))) {instance name: reshape} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "817859", forward_comm_node_unique_id: "729440", micro: I64(0)} - : (, ) -> () - # Fullname with scope: 
(Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Reshape-op0) - %16(equiv_16875_query) = PrimFunc_Transpose(%15, (I64(0), I64(2), I64(1), I64(3))) {instance name: transpose} primitive_attrs: {in_strategy: ((1, 1, 8, 1))} cnode_attrs: {checkpoint: Bool(1)} cnode_primal_attrs: {unique_id: "782042", micro: I64(0)} {in_strategy: ((1, 1, 8, 1))} - : (, ) -> () - # Fullname with scope: (Default/network-MFPipelineWithLossScaleCell/network-_VirtualDatasetCell/_backbone-GradAccumulationCell/network-LlamaForCausalLM/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention-LLamaAttention/Transpose-op0) -``` - -`%XX` 表示步骤,后面对应算子名称,括号内包含入参及输出。Fullname with scope包含了完成的class、方法名等信息。 - -* `%13` - - 此步直接加载wq.weight,得到。 - -* `%14` - - 将前面的%12输出与%13输出进行MatMul操作,得到。 - -* `%15` - - 将上述14%的输出进行Reshape操作得到。 - -* `%16` - - 将上述15%的输出进行Transpose操作得到。 - -在保存IR图时建议将模型的层数减小,以缩短编译存图的时间,方便快速调试。详细内容参考[IR文件介绍](https://www.mindspore.cn/tutorials/zh-CN/r2.6.0/debug/error_analysis/mindir.html#ir文件介绍)和[分析示例](https://www.mindspore.cn/tutorials/zh-CN/r2.6.0/debug/error_analysis/mindir.html#如何根据analyze-failir文件分析图推导失败的原因)。 - -#### SAPP自动负载均衡工具 - -大模型训练性能调优需要同时考虑多维混合并行策略配置与内存限制,工程师需要在集群上尝试不同的组合方案,才能找到性能达标的并行策略,这一过程常常耗费数周时间,且消耗大量算力成本。 - -MindSpore提供了SAPP(Symbolic Automatic Parallel Planner)自动负载均衡工具。只需输入模型的内存和时间信息,以及部分流水线并行性能相关的超参(如重计算对性能的影响),工具将自行构建线性规划问题,通过全局求解的方式,为大模型自动生成流水线并行中的stage-layer配比,调整各layer重计算策略,自动优化集群算力和内存利用率,降低空等时间,实现Pipeline并行分钟级策略寻优,大幅度降低性能调优成本,显著提升端到端训练性能。 - -详细使用方法,请参考[SAPP流水线负载均衡](https://gitee.com/mindspore/mindformers/tree/r1.5.0/toolkit/pipeline_balance)工具介绍。 - -## 整体思路 - -大模型的性能优化方法主要依赖于profiling数据分析以及内存分析,分析当前性能的瓶颈,并做出针对性优化动作,然后验证性能收益,分析进一步的优化方向。整体调优流程如下: - -1. 分析profiling数据,查看是否存在耗时明显异常高的算子,如存在,可尝试替换等价算子,并将异常算子的耗时信息提交issue进行反馈; -2. 分析通信耗时,查看是否存在更优的分布式策略,查看IR图分析是否存在不合理的重排布问题,解决这些影响通信效率的问题,以提升整个集群的训练效率; -3. 
分析内存使用情况,查看是否存在异常大内存Tensor,是否存在可融合的算子降低激活值内存,在有内存富裕的情况可以调整选择重计算的配置策略,利用空余内存以换取训练性能,或是降低模型切分的份数,减少模型切分带来的通信开销从而提高性能。 - -性能优化是一个循环往复的过程,算子性能无明显异常后,就可对分布式策略进行试验分析,优化异常的通信耗时与重排布开销;然后进行内存的优化分析,消除异常的大内存Tensor;完成内存优化后需要进一步查看,空余显存是否支持重新调整并行策略设置,以获取通信开销更小的策略设定,充分利用内存以获得更优性能;这样循环往复地优化,进而一步步达到设定的性能目标。 - -完成一轮性能优化后,还需要确保模型精度对齐,若对齐则应用该优化策略。 - -## 瓶颈分析与优化 - -在明确整体的调优思路后,就可以通过性能分析工具和内存评估工具分析训练模型的性能瓶颈,并针对瓶颈点应用优化手段,验证收益,分析新的瓶颈点进一步优化,这样一步步地接近模型训练性能的最优解。下面列出常见的性能瓶颈,并给出对应可用的优化措施。 - -### 内存瓶颈 - -内存瓶颈是大模型训练场景下需要解决的第一道问题;随着模型规模的扩大,训练大模型所需要的内存资源也随之上涨,而单卡所提供的内存容量是有限的,因此需要通过分布式并行策略,结合重计算,优化器并行等手段,在多卡集群上摊分模型训练所需的资源以解决内存不足问题。 - -下面列出针对内存瓶颈场景下的优化手段: - -* **模型并行(MP)/张量并行(TP)**: - * 适用场景:模型参数量大,需大量降低权重占用内存的场景; - * 收益:使用多卡切分模型权重,内存使用量降低最多; - * 开销:使用更多的硬件资源,引入大量通信开销; - * 使用建议:建议在参数量超过20B的模型上使用,且限制在8以内,避免产生跨机通信开销。 -* **流水线并行(PP)**: - * 适用场景:模型权重,优化器状态等静态内存放不下的场景; - * 收益:使用多卡切分模型阶段,通信开销较MP小很多; - * 开销:引入计算时空闲(bubble),以及较小的stage间通信开销; - * 使用建议:权重需要切分的场景都可尝试使用,并通过超参调整降低bubble性能损耗。 -* **长序列并行(CP)**: - * 适用场景:训练长序列任务(>=32k),激活值过高的场景; - * 收益:长序列训练场景分摊激活值开销,使得通过扩充机器资源以拓展长序列能力成为可能; - * 开销:引入通信开销。 - -以上三种并行策略都是使用更多的计算设备来分摊内存消耗,以解决内存瓶颈问题;花费的代价就是需要更多的硬件资源,并引入了额外的通信量,在同等规模的集群上训练吞吐率不如数据并行训练。 - -* **优化器并行**: - * 适用场景:在有数据并行DP的场景下,将模型权重与优化器状态在DP域内切分到每张卡上,大幅降低显存消耗; - * 收益:模型权重与优化器状态在DP域内切分,节省大量内存使用; - * 开销:计算时引入一定量的通信来完成权重聚合; - * 使用建议:大部分情况下都建议开启,节省的显存可用于调整并行切分策略以整体提升性能。 -* **[完全重计算&选择重计算](#重计算)**: - * 适用场景:切分策略确定后,内存使用仍有部分超出,可调整完全重计算&选择重计算策略,进一步优化内存使用; - * 收益:节省内存使用; - * 开销:计算时间进一步增长; - * 使用建议:优先使用选择重计算,不超过内存使用时尽可能控制重计算带来的计算开销。 -* **短序列并行**: - * 适用场景:在MP切分下,使能短序列并行,在LayerNorm处对序列维按MP进行切分,通信量不变,减少激活值内存与Norm部分计算量; - * 收益:节省内存使用与计算时间,同时不增加通信量,不需要额外卡数资源; - * 使用建议:建议在MP场景下都开启。 - -### 计算时长瓶颈 - -正常情况下,计算时长应主要集中于matmul、flash attention等计算密集的算子上,如果在profiling分析中发现耗时异常的计算算子导致性能瓶颈的,可尝试替换等价算子,并同步提交算子性能issue至MindSpore Transformers或MindSpore。 - -在模型调优层面,可以尝试以下方法解决缓解计算时长瓶颈: - -* **融合算子替换**: - * 使用融合算子等价替换部分算子组合,融合算子通常会带来性能和内存上的收益。 -* **重计算&选择重计算**: - * 涉及到时间和空间的平衡取舍,在有空余内存时,减少重计算的层数能够有效利用空余内存来提升计算性能。 - -### 未掩盖通信瓶颈 - -通过profiling工具可以获取训练进程的通信时长占比,其中包括已掩盖通信和未掩盖通信;已掩盖通信和计算同时执行,不影响训练效率,而未掩盖的通信则会导致计算等待通信,这部分通信耗时过长将影响训练性能,需要优化。 - -* **IR图分析冗余通信算子**: - 通过配置环境变量`export MS_DEV_SAVE_GRAPHS=1`,保存训练IR图,分析模型前向过程中的通信算子分布,看是否符合预期; - 如在不合理的位置出现一连串的通信算子,则很可能是模型中配置的算子切分策略有误,导致触发了tensor重排布,框架自动插入了较多通信算子以保证计算等价; - 这部分由于通信重排引入的冗余通信很可能导致出现大量的未掩盖通信,造成性能瓶颈,解决办法就是将对应位置算子的shard策略修改配置正确,解决通信重排问题。 -* **多副本&细粒度多副本并行**: - 分析并解决通信重排问题后,如仍存在较多未掩盖通信,可尝试使用多副本或细粒度多副本并行策略; - 在模型并行场景下,使能多副本或细粒度多副本并行,通信时间和计算时间可以部分相互掩盖,从而减少通信瓶颈。 - -### IO瓶颈 - -IO效率仅在特定情况下会成为模型训练的性能瓶颈,即IO读取一个step所需的训练数据的时间大于完成一个step前反向所有计算通信的时间。由于数据读取进程与训练进程异步,因此只要IO速度大于训练速度,每次训练下一个step时都能保证训练数据已经就绪,IO就不会阻塞训练进程;反之,IO速度大于训练速度时,每次训练下一个step,都需等待训练数据读取就绪,这部分阻塞时间就计入了训练整体时间,成为性能瓶颈。 - -这种IO瓶颈通常出现于大集群共享存储的场景下,大集群的多个训练进程共同访问同一共享存储,导致IO压力上涨,效率降低。IO瓶颈在Profiling中表现为,timeline上,每个step间存在较大的数据读取空隙,期间计算闲置。 - -IO瓶颈的解决思路就是优化IO量与IO行为。 - -**full_batch=false**: - -full_batch是MindSpore的数据聚合行为的控制项,在配置为true时,每张卡都取global batch size的数据量,然后在图内完成数据的切分,只取对应DP域内所需数据进行训练;这种做法会导致大规模集群下对IO的压力陡增,每张卡读取IO量都存在DP倍的冗余,这种冗余发生在每张卡上,汇总起来对共享存储的压力过大,影响IO性能;建议在遇到IO瓶颈时,改用full_batch=false的行为模式,已验证能够较为明显地优化IO效率,配置方式可参考MindSpore[set_auto_parallel_context接口](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore/mindspore.set_auto_parallel_context.html#mindspore.set_auto_parallel_context),yaml样例如下: - -```yaml -#yaml文件配置 -parallel: # 在parallel模块下 - ... - full_batch: False # 配置full batch为False - dataset_strategy: [[dp, 1], [dp, 1]] # dp替换为实际的dp配置数 - ... 
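-  # 示例(假设数据集有两项输入且dp=4,请按数据集输入的实际个数与dp值填写):
-  # dataset_strategy: [[4, 1], [4, 1]]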
-``` - -其中,`dataset_strategy`数组中的两个[dp, 1]分别对应数据集两项输入的[bs, seq_len]维度,需根据数据集输入的个数和shape实际情况进行配置,dp切分对应bs维度即可。 - -也可从数据集入手优化IO量,数据集应尽量减小空间复杂度,如`attention_mask`这样空间复杂度为O(N^2)的输入项,就不太适合直接落盘至存储中;可以通过读取其他空间复杂度更小的相关信息,在训练进程读取数据的流程中,利用cpu即时生成,以减小IO访问量,整体加快数据读取速度。 - -### pp场景bubble过多 - -pipeline场景下主要开销是引入了计算闲置(bubble),其大概估算公式为:$bubble\ ratio=\frac{p-1}{m+p-1}$,其中,$p$为pipeline的stage数量,$m$为设定的micro batch num。 - -为减小bubble空闲,可以从公式入手,在stage数量固定的情况下,可以增大micro batch num,使得整体的bubble占比降低,能够有效提高训练效率; - -然而在部分训练场景下,global batch size是一个较为关键的训练超参数,可能无法随意调整;这时可以尝试使用多流水交织(pp interleave)特性来优化bubble占比。 - -**多流水交织 pipeline interleave**: - -pipeline_interleave(virtual pipeline)官网配置介绍:[set_auto_parallel_context](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore/mindspore.set_auto_parallel_context.html?highlight=pipeline_interleave)。 - -MindSpore Transformers中,开启多流水交织需要在parallel中配置,例如使用1f1b排布方式: - -```yaml -parallel: - ... - pipeline_config: - pipeline_interleave: True - pipeline_scheduler: '1f1b' - ... -``` - -之后在model_config中配置pp_interleave_num,例如按如下yaml配置为2: - -```yaml -model: - model_config: - ... - pp_interleave_num: 2 - ... -``` - -收益:pp interleave场景下的bubble占比公式为$bubble\ ratio=\frac{p-1}{vm+p-1}$,其中$v$为配置的pp_interleave_num,从公式中可以发现,提高$v$也可以达到减小bubble占比的作用。 - -开销:pp interleave算法理论上会使用更多的内存,是一种空间换时间的策略,使用时需要根据内存变化情况重新调整内存使用策略。 - -### 负载均衡策略调整 - -在分布式训练中,pipeline并行策略涉及到不同卡间的负载不均现象。 - -在pipeline并行下,由于模型按层切分stage,使得首尾两个stage设计layer外的模块实现,如embedding、head、loss计算等模块,使得首尾两个stage的计算时长会高于中间stage,这是时间上的负载不均衡;而由于pipeline流水执行前反向的特性,最早执行的stage最晚释放所有内存,使得不同stage的内存消耗不同,越靠前的stage消耗内存越多,这是空间上的不均衡。 - -这种情况下可以通过配置模型层数偏移offset,来手动调整各个stage间的负载层数; - -例如,在PP stage为4,首个stage消耗内存过高的场景,可以这样设置`offset:[-2, 1, 1, 0]`,将stage 0的两层负载分别放到stage 1和stage 2上,这样可以降低首个stage的空间消耗,同时计算负载从首尾两个stage的限制转移到中间stage的额外层上,也没有过多降低计算效率。 - -尽量不要出现一个stage上分配过多层数的情况,否则会形成计算效率的短板stage,拖慢整个训练进程;可以结合重计算对内存空间的利用,进行更为精细化的负载均衡调整。 - -建议尝试使用[自动负载工具](#sapp自动负载均衡工具)以获取一个最优的负载均衡策略配置。 - -## 典型案例 - -### SiLU-Mul重计算未生效 - -在开启细粒度多副本时,对SiLU和Mul做重计算可以节省内存,但关闭细粒度多副本时,对SiLU和Mul做重计算不能节省内存。定位过程如下: - -1. 确认配置了重计算 - - 在IR图中检查Cast、SiLU和Mul算子是否有“recompute: Bool(1)”的标签,如果有标签说明算子配置了重计算。 - -2. 检查重计算生效算子 - - 在IR图中检查Cast、SiLU和Mul等算子是否有duplicated标签,没有带标签的算子说明实际计算图没有重计算这部分算子。如下示例只有Cast算子带了duplicated标签。 - - ```text - %1834(CNode_108839) = PrimFunc_Cast(%1833, I64(43)) {instance name: cast} primitive_attrs: {output_names: [output], input_names: [x, dst_type], recompute: Bool(1)} cnode_attrs: {recompute_sub_graph: U64(64), recompute_id: I64(65), duplicated: Bool(1), need_cse_after_recompute: Bool(1)} cnode_primal_attrs: {micro: I64(0)} - : (, ) -> () - ``` - -3. 
检查反向计算输入 - - 在IR图中检查SiLU和Mul的反向算子的输入是否符合预期,在关闭细粒度多副本时,SiLU和Mul之间、Mul和MatMul之间均有Reshape算子,而开启细粒度多副本时,SiLU、Mul和MatMul是相连的。绘制相关流程如下: - -![reshape](./images/reshape.png) - -由此可知根因在于,细粒度多副本场景中Linear的输入shape是二维的,而非细粒度多副本中Linear的输入shape是三维的,所以Linear和Mul之间有Reshape算子,没对这个Reshape算子重计算导致对SiLU的重计算没有生效。额外对Reshape重计算后内存可以正常减小。参考配置如下: - -```yaml -recompute_config: - recompute: False - select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w2\.reshape'] -``` - -### Llama2-13B极致性能优化 - -13B默认用单机DP: 8、MP: 1、PP: 1,开完全重计算,性能在1860tokens/s/p左右,相较于7B(2465tokens/s/p)与70B(1974tokens/s/p),性能明显偏低。 - -经分析,13B性能瓶颈主要在于内存,无论是单机还是多机,如果不开MP,对SiLU和Mul做选择重计算内存依然不够,则需要开完全重计算。完全重计算会额外多20%到25%的计算量,导致性能偏低。 - -经过实测,开MP关闭重计算,性能比纯DP还要低。双机并行策略调整为DP: 8、MP: 1、PP: 2、micro: 128,开完全重计算,性能提升至2136tokens/s/p。将完全重计算改为选择重计算,并精细选择算子,使每层的内存尽可能减少,性能提升至2189tokens/s/p。 - -```yaml -select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w1\.matmul', 'feed_forward\.w3\.matmul', 'feed_forward\.W3\.reshape', 'feed_forward\.w2\.matmul', 'feed_forward\.w2\.reshape', 'ffn_norm\.norm', 'ffn_norm\.rcast', 'attention_norm\.norm', 'attention_norm\.rcast', 'attention\.wq\.reshape', 'attention\.wk\.reshape', 'attention\.wv\.reshape', 'attention\.wo\.matmul', 'attention\.wo\.reshape', 'attention\.merger_head_transpose', 'add', 'attention\.flash attention'] -``` - -调整不同stage的重计算层数,使stage1的重计算量减少,性能提升至2210tokens/s/p。 - -```yaml -select_recompute: - 'feed_forward\.mul': [20, 8] - 'feed_forward\.w1\.activation': [20, 8] - 'feed_forward\.w1\.matmul': [20, 0] - 'feed_forward\.w1\.reshape': [20, 8] - 'feed_forward\.w3\.matmul': [20, 0] - 'feed_forward\.w3\.reshape': [20, 0] - 'feed_forward\.w2\.matmul': [20, 0] - 'feed_forward\.w2\.reshape': [20, 0] - 'ffn_norm\.norm': [20, 0] - 'ffn_norm\.rcast': [20, 0] - 'attention_norm\.norm': [20, 0] - 'attention_normi.rcast': [20, 0] - 'attention\.wq\.reshape': [20, 0]e - 'attention\.wk\.reshape': [20, 0]e - 'attention\.w\.reshape': [20, 0]e - 'attention\.wol.matmul': [20, 0] - 'attention\.wo\.reshape': [20, 0]e - 'attention\.merger head transpose': [20, 0] - 'add': [20, 0] - 'attention\.flash_attention': [20, 0] -``` - -使用图编译等级为O0/O1图算融合,内存有进一步优化,将大部分算子的选择重计算改为部分层的完全重计算,其余层配置SiLU和Mul的选择重计算,stage0、stage1分别完全重计算13层、5层,性能提升至2353tokens/s/p。逐步减少stage0、stage1完全重计算至4层、0层,性能提升至2562tokens/s/p(max_device_memory: 57.2GB)。参考配置如下: - -```yaml -recompute_config: - recompute: [4, 0] - select_recompute: ['feed_forward\.mul', 'feed_forward\.w1\.activation', 'feed_forward\.w1\.reshape', 'feed_forward\.w2\.reshape'] -``` - -最终经过调优后,Llama2-13B性能优化至2562tokens/s/p,总计提升37%。 - -### Llama千卡集群训练调优 - -基于Llama2-70B模型配置,调整模型超参,扩充参数量至xxxB,使用1024卡集群+共享存储进行训练,设定GBS (global batch size)为128;下面针对对该案例进行性能瓶颈分析,给出优化方式参考。 - -**案例瓶颈分析**: - -首先通过DryRun测试模型训练所需的大致内存,确定整体的切分策略,在此基础上进行调整,初步得到的切分策略:`DP=8 MP=8 PP=16 micro_batch_num=16`。 - -对初步的切分策略进行测试,收集性能和内存数据,分析该场景下的性能瓶颈如下: - -* **IO瓶颈**:千卡同时访问共享存储读取数据,存储压力过大赶不上训练速度,导致性能波动; -* **大词表内存瓶颈**:自定义超参的vocab_size偏大,导致embedding和lm_head结构占用内存过多; -* **未掩盖通信瓶颈**:mp并行数设置为8后,通信量相对较高,出现较多未掩盖通信; -* **bubble过多**:PP stage切分达到了16,而micro_batch_num受限于gbs,只能开到16,这样pipeline流程中出现了过多的bubble; -* **stage间负载不均衡**:stage 0和stage 1内存消耗过高,需要调整负载均衡策略。 - -**优化方法**: - -针对上述分析的瓶颈点,我们可以应用以下优化方法: - -1. 使用full_batch=false读取数据:优化IO读取量,减轻IO压力,解决IO瓶颈导致的性能波动问题; - - full_batch相关使用介绍参考[IO瓶颈章节](#io瓶颈)。这里dp8的配置样例为: - - ```yaml - parallel: # 在parallel模块下 - ... - full_batch: False # 配置full batch为False - dataset_strategy: [[8, 1],] # dp为8,仅一项输入 - ... 
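-  # 注:此处数据集仅有一项输入;若数据集有多项输入,需为每一项输入各配置一个[8, 1]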
- ``` - -2. embedding参数配置优化器并行:大词表占用内存过多,且词表权重的优化器并行需额外配置,配置后有效缓解首个stage显存不足问题; - - 优化器并行使用介绍可参考[MindSpore优化器并行文档](https://www.mindspore.cn/docs/zh-CN/r2.6.0/features/parallel/optimizer_parallel.html);此外,Llama模型还对embedding层的优化器有额外配置,[LlamaConfig API文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.LlamaConfig.html#mindformers.models.LlamaConfig)中的`parallel_optimizer`项即为控制embedding优化器并行的控制项; - 配置样例如下: - - ```yaml - parallel: - ... - enable_parallel_optimizer: True # 启用全局优化器并行 - ... - - model: - model_config: - ... - parallel_optimizer: True # 给embedding层配置优化器并行 - ... - ``` - -3. 使能Llama的`细粒度多副本`策略,掩盖模型并行策略下的大部分通信行为; - - 多副本并行的介绍可以参考[MindSpore多副本并行文档](https://www.mindspore.cn/tutorials/zh-CN/r2.6.0/parallel/multiple_copy.html),在MindSpore Transformers中通过`fine_grain_interleave`项来配置细粒度多副本的行为,参考配置如下: - - ```yaml - model: - model_config: - ... - fine_grain_interleave: 2 # 配置细粒度多副本份数,默认值为1表示不启用,为2时则启用计算通信掩盖 - ... - ``` - -4. 使能`pp_interleave`并行策略,将`pp_interleave_num`配置为3,有效减小bubble占比; - - 多流水交织特性介绍可以参考[MindSpore流水线并行文档](https://www.mindspore.cn/docs/zh-CN/r2.6.0/features/parallel/pipeline_parallel.html),在MindSpore Transformers中的参考配置如下: - - ```yaml - parallel: - ... - pipeline_config: - pipeline_interleave: true # 启用多流水交织 - pipeline_scheduler: '1f1b' # 调度方式使用1f1b - ... - - model: - model_config: - ... - pp_interleave_num: 3 # 流水交织份数配置为3 - ... - ``` - -5. 调整stage间的负载,配置`offset`,将前两个stage的层数分摊至后续显存空余的层中; - - 负载均衡介绍可参照[前文负载均衡章节](#负载均衡策略调整),这里结合`pp_interleave_num: 3`的配置后,offset配置如下: - - ```yaml - model: - model_config: - ... - offset: [[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]] - ... - ``` - - `pp_interleave_num`为3时,offset应配置为3个子列表,与流水切分数目对应;每个子列表长度为pipeline stage的数目,代表该位置需要增加或减少的层数;对上述配置来说,stage 0减少了两层负载,分配到了倒数两个stage上。 - -6. 
精细调整每个stage的重计算策略,使每个stage尽可能地用满显存以获取最佳性能。 - - 这部分可以借助[SAPP自动负载均衡工具](#sapp自动负载均衡工具)来完成;优化后得到的重计算策略配置如下: - - ```yaml - select_recompute: - 'feed_forward\.mul': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - 'feed_forward\.w1\.activation\.silu': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - 'feed_forward\.w1\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - 'feed_forward\.w2\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - 'add': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - 'cast_up': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1]] - select_comm_recompute: - '.*\.norm': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - 'attention\.wq\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - 'attention\.wk\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - 'attention\.wv\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - 'feed_forward\.w1\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - 'feed_forward\.w3\.reshape': [[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1]] - ``` - -**优化结果**: - -经过上述的瓶颈分析与针对性的优化调整,训练性能有了明显的提升,达到优化前的1.7倍(在当时环境下的实测数据,仅供参考)。 - -上述调优案例体现了我们如何通过分析性能瓶颈点,找到可用的优化手段,逐步逼近性能最优配置的调优思路;希望本文能够帮助读者掌握整体调优思路,在各个不同调优场景下都能够通过分析明确性能优化的方向,获取良好的训练性能。 diff --git a/docs/mindformers/docs/source_zh_cn/quick_start/install.md b/docs/mindformers/docs/source_zh_cn/quick_start/install.md deleted file mode 100644 index f977211c620ab81d41bdd0c2ad4f0932517cddb9..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/quick_start/install.md +++ /dev/null @@ -1,52 +0,0 @@ -# 安装 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/quick_start/install.md) - -## 确认版本匹配关系 - -当前支持的硬件为[Atlas 800T A2](https://www.hiascend.com/hardware/ai-server?tag=900A2)训练服务器。 - -当前套件建议使用的Python版本为3.11.4。 - -| MindSpore Transformers | MindSpore | CANN | 固件与驱动 | 镜像链接 | -|:-----------:|:---------:|:----:|:-----:|:----:| -| 在研版本 | 在研版本 | 在研版本 | 在研版本 | 不涉及 | - -**当前MindSpore Transformers建议使用如上的软件配套关系。** - -历史版本配套关系: - -| MindSpore Transformers | MindSpore | CANN | 固件与驱动 | 镜像链接 | 
-|:----------------------------------------------------:|:-------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------:| -| [1.3.2](https://pypi.org/project/mindformers/1.3.2/) | [2.4.10](https://www.mindspore.cn/install/) | [8.0.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit) | [24.1.0](https://www.hiascend.com/document/detail/zh/canncommercial/800/softwareinst/instg/instg_0000.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit) | [Link](http://mirrors.cn-central-221.ovaijisuan.com/detail/168.html) | -| [1.2.0](https://pypi.org/project/mindformers/1.2.0/) | [2.3.0](https://www.mindspore.cn/install/) | [8.0.RC2.beta1](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1) | [24.1.RC2](https://www.hiascend.com/hardware/firmware-drivers/community) | [Link](http://mirrors.cn-central-221.ovaijisuan.com/detail/138.html) | - -## 安装依赖软件 - -1. 安装固件与驱动:通过[版本匹配关系](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/quick_start/install.html#%E7%A1%AE%E8%AE%A4%E7%89%88%E6%9C%AC%E5%8C%B9%E9%85%8D%E5%85%B3%E7%B3%BB)中的固件与驱动链接下载安装包,参考[昇腾官方教程](https://www.hiascend.com/document/detail/zh/quick-installation/24.0.RC1/quickinstg_train/800_9000A2/quickinstg_800_9000A2_0007.html)进行安装。 - -2. 安装CANN和MindSpore:使用官方提供的Docker镜像(镜像中已包含CANN、MindSpore,无需手动安装)或者按照MindSpore官网的[手动安装](https://www.mindspore.cn/install/)章节进行安装。 - -## 安装MindSpore Transformers - -目前在研版本仅支持源码编译安装,用户可以执行如下命令安装MindSpore Transformers: - -```bash -git clone -b r1.5.0 https://gitee.com/mindspore/mindformers.git -cd mindformers -bash build.sh -``` - -## 验证是否成功安装 - -判断MindSpore Transformers是否安装成功可以执行以下代码: - -```bash -python -c "import mindformers as mf;mf.run_check()" -``` - -出现以下类似结果,证明安装成功: - -```text -- INFO - All checks passed, used **** seconds, the environment is correctly set up! 
-``` \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/quick_start/source_code_start.md b/docs/mindformers/docs/source_zh_cn/quick_start/source_code_start.md deleted file mode 100644 index 2328fc77e244dbd400c44d8d13055428749c769d..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/quick_start/source_code_start.md +++ /dev/null @@ -1,110 +0,0 @@ -# 快速启动 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/quick_start/source_code_start.md) - -本节展示如何使用MindSpore Transformers快速拉起一个基于 Llama2-7B 模型的LoRA低参微调任务。如果想要通过MindSpore Transformers使用其他模型和任务,请阅读对应的[模型文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/start/models.html)。 - -## 准备权重文件 - -MindSpore Transformers提供已经转换完成的预训练权重、词表文件用于预训练、微调和推理,用户也可以下载HuggingFace官方权重经过模型权重转换后进行使用。为了方便起见,这里不对转换原始权重过多赘述,有需要请参考[Llama2文档](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#模型权重转换)以及[权重转换](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/weight_conversion.html)了解更多细节。这里请直接下载`MindSpore`权重,下载转换后的`.ckpt`文件以及`tokenizer.model`文件进行后续的处理。 - -| 模型名称 | MindSpore权重 | HuggingFace权重 | -| ------ | ------ | ------ | -| Llama2-7B | [llama2_7b.ckpt](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/llama2_7b.ckpt) | [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | - -词表下载链接:[tokenizer.model](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/tokenizer.model) - -## 准备数据集 - -1. 微调过程中使用的数据集文件alpaca_data.json在[Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca)下载获得。 - -2. 数据预处理。 - - 需要在MindSpore Transformers代码根目录下执行以下操作,并将下文中的{path}替换成存放数据集文件的本地路径。 - - 1. 执行[mindformers/tools/dataset_preprocess/llama/alpaca_converter.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/llama/alpaca_converter.py),添加prompt模板,将原始数据集转换为多轮对话格式。 - - ```shell - python mindformers/tools/dataset_preprocess/llama/alpaca_converter.py \ - --data_path /{path}/alpaca_data.json \ - --output_path /{path}/alpaca-data-conversation.json - ``` - - **参数说明** - - - data_path: 输入下载的文件路径。 - - output_path: 输出文件的保存路径。 - - 2. 执行[mindformers/tools/dataset_preprocess/llama/llama_preprocess.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/llama/llama_preprocess.py),生成MindRecord数据,将带有prompt模板的数据转换为MindRecord格式。 - - ```shell - python mindformers/tools/dataset_preprocess/llama/llama_preprocess.py \ - --dataset_type qa \ - --input_glob /{path}/alpaca-data-conversation.json \ - --model_file /{path}/tokenizer.model \ - --seq_length 4096 \ - --output_file /{path}/alpaca-fastchat4096.mindrecord - ``` - - **参数说明** - - - dataset_type: 预处理数据类型。选项包括 "wiki" 和 "qa" 两种。 - - "wiki" 用于处理 Wikitext2 数据集,该数据集适用于预训练和评测阶段。 - - "qa" 用于处理 alpaca 数据集,将该数据集转换为问答格式,该数据集适用于微调阶段。 - 其他的数据集转换脚本请参考对应的[模型文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/start/models.html)。 - - input_glob: 转换后的alpaca的文件路径。 - - model_file: 模型tokenizer.model文件路径。 - - seq_length: 输出数据的序列长度。 - - output_file: 输出文件的保存路径。 - - 3. 控制台输出如下内容,证明格式转换成功。 - - ```shell - # 控制台输出 - Transformed 52002 records. 
- Transform finished, output files refer: {path}/alpaca-fastchat4096.mindrecord - ``` - -## 启动微调 - -在MindSpore Transformers代码根目录下,执行如下命令拉起微调任务: - -```shell -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/lora_llama2_7b.yaml \ - --train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord \ - --load_checkpoint /{path}/llama2_7b.ckpt \ - --auto_trans_ckpt True \ - --use_parallel True \ - --run_mode finetune" 8 -``` - -**命令说明:** - -- `scripts/msrun_launcher.sh`:分布式任务拉起脚本。 -- `"run_mindformer.py ..."`:每张卡上执行的Python任务的参数字符串,其中参数包括: - - `run_mindformer.py`:一键启动脚本。 - - `--config`:指定任务配置文件路径 `configs/llama2/lora_llama2_7b.yaml` 。 - - `--train_dataset_dir`:指定数据集路径 `/{path}/alpaca-fastchat4096.mindrecord` 。 - - `--load_checkpoint`:指定权重文件路径 `/{path}/llama2_7b.ckpt` 。 - - `--auto_trans_ckpt True`:打开权重自动切分功能。 - - `--use_parallel True`:设置为分布式任务。 - - `--run_mode finetune`:设定运行模式为微调。 -- `8`:设置任务使用8张NPU。 - -当控制台出现如下日志时: - -```shell -Start worker process with rank id:0, log file:output/msrun_log/worker_0.log. Environment variable [RANK_ID=0] is exported. -Start worker process with rank id:1, log file:output/msrun_log/worker_1.log. Environment variable [RANK_ID=1] is exported. -Start worker process with rank id:2, log file:output/msrun_log/worker_2.log. Environment variable [RANK_ID=2] is exported. -Start worker process with rank id:3, log file:output/msrun_log/worker_3.log. Environment variable [RANK_ID=3] is exported. -Start worker process with rank id:4, log file:output/msrun_log/worker_4.log. Environment variable [RANK_ID=4] is exported. -Start worker process with rank id:5, log file:output/msrun_log/worker_5.log. Environment variable [RANK_ID=5] is exported. -Start worker process with rank id:6, log file:output/msrun_log/worker_6.log. Environment variable [RANK_ID=6] is exported. -Start worker process with rank id:7, log file:output/msrun_log/worker_7.log. Environment variable [RANK_ID=7] is exported. 
-``` - -说明微调任务已拉起,微调进度可在`output/msrun_log/`目录下查看。 - -关于Llama2更多细节,以及更多的启动方式,请具体参考`Llama2` 的 [README](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#llama-2)文档获取更多支持。 diff --git a/docs/mindformers/docs/source_zh_cn/start/image/overall_architecture.png b/docs/mindformers/docs/source_zh_cn/start/image/overall_architecture.png deleted file mode 100644 index 7d4073d7357404abfe78eee8b5684f9a101b78d9..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/start/image/overall_architecture.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/start/models.md b/docs/mindformers/docs/source_zh_cn/start/models.md deleted file mode 100644 index 48a42520f981b9f14eeab72fab65e945e649ca9c..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/start/models.md +++ /dev/null @@ -1,55 +0,0 @@ -# 模型库 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/start/models.md) - -当前MindSpore Transformers全量的模型列表如下: - -| 模型名 | 支持规格 | 模型类型 | 最新支持版本 | -|:--------------------------------------------------------------------------------------------------------|:------------------------------|:------------:|:------:| -| [CodeLlama](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/codellama.md) | 34B | 稠密LLM | 在研版本 | -| [CogVLM2-Image](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_image.md) | 19B | MM | 在研版本 | -| [CogVLM2-Video](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md) | 13B | MM | 在研版本 | -| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek3) | 671B | 稀疏LLM | 在研版本 | -| [DeepSeek-V2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek2) | 236B | 稀疏LLM | 在研版本 | -| [DeepSeek-Coder-V1.5](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek1_5) | 7B | 稠密LLM | 在研版本 | -| [DeepSeek-Coder](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/deepseek) | 33B | 稠密LLM | 在研版本 | -| [GLM4](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/glm4.md) | 9B | 稠密LLM | 在研版本 | -| [GLM3-32K](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/glm32k) | 6B | 稠密LLM | 在研版本 | -| [GLM3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/glm3.md) | 6B | 稠密LLM | 在研版本 | -| [InternLM2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/internlm2) | 7B/20B | 稠密LLM | 在研版本 | -| [Llama3.1](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/llama3_1) | 8B/70B | 稠密LLM | 在研版本 | -| [Llama3](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/llama3) | 8B/70B | 稠密LLM | 在研版本 | -| [Llama2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md) | 7B/13B/70B | 稠密LLM | 在研版本 | -| [Mixtral](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/mixtral) | 8x7B | 稀疏LLM | 在研版本 | -| [Qwen2](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2) | 0.5B/1.5B/7B/57B/57B-A14B/72B | 稠密/稀疏LLM | 在研版本 | -| [Qwen1.5](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen1_5) | 7B/14B/72B | 稠密LLM | 在研版本 | -| [Qwen-VL](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwenvl) | 9.6B | MM | 在研版本 | -| [Whisper](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/whisper.md) | 1.5B | MM | 在研版本 | -| 
[Yi](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/yi) | 6B/34B | 稠密LLM | 在研版本 | -| [Baichuan2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/baichuan2/baichuan2.md) | 7B/13B | 稠密LLM | 1.3.2 | -| [GLM2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/glm2.md) | 6B | 稠密LLM | 1.3.2 | -| [GPT2](https://gitee.com/mindspore/mindformers/blob/r1.3.0/docs/model_cards/gpt2.md) | 124M/13B | 稠密LLM | 1.3.2 | -| [InternLM](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/internlm/internlm.md) | 7B/20B | 稠密LLM | 1.3.2 | -| [Qwen](https://gitee.com/mindspore/mindformers/blob/r1.3.0/research/qwen/qwen.md) | 7B/14B | 稠密LLM | 1.3.2 | -| [CodeGeex2](https://gitee.com/mindspore/mindformers/blob/r1.1.0/docs/model_cards/codegeex2.md) | 6B | 稠密LLM | 1.1.0 | -| [WizardCoder](https://gitee.com/mindspore/mindformers/blob/r1.1.0/research/wizardcoder/wizardcoder.md) | 15B | 稠密LLM | 1.1.0 | -| [Baichuan](https://gitee.com/mindspore/mindformers/blob/r1.0/research/baichuan/baichuan.md) | 7B/13B | 稠密LLM | 1.0 | -| [Blip2](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/blip2.md) | 8.1B | MM | 1.0 | -| [Bloom](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/bloom.md) | 560M/7.1B/65B/176B | 稠密LLM | 1.0 | -| [Clip](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/clip.md) | 149M/428M | MM | 1.0 | -| [CodeGeex](https://gitee.com/mindspore/mindformers/blob/r1.0/research/codegeex/codegeex.md) | 13B | 稠密LLM | 1.0 | -| [GLM](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/glm.md) | 6B | 稠密LLM | 1.0 | -| [iFlytekSpark](https://gitee.com/mindspore/mindformers/blob/r1.0/research/iflytekspark/iflytekspark.md) | 13B | 稠密LLM | 1.0 | -| [Llama](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/llama.md) | 7B/13B | 稠密LLM | 1.0 | -| [MAE](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/mae.md) | 86M | MM | 1.0 | -| [Mengzi3](https://gitee.com/mindspore/mindformers/blob/r1.0/research/mengzi3/mengzi3.md) | 13B | 稠密LLM | 1.0 | -| [PanguAlpha](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/pangualpha.md) | 2.6B/13B | 稠密LLM | 1.0 | -| [SAM](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/sam.md) | 91M/308M/636M | MM | 1.0 | -| [Skywork](https://gitee.com/mindspore/mindformers/blob/r1.0/research/skywork/skywork.md) | 13B | 稠密LLM | 1.0 | -| [Swin](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/swin.md) | 88M | MM | 1.0 | -| [T5](https://gitee.com/mindspore/mindformers/blob/r1.0/docs/model_cards/t5.md) | 14M/60M | 稠密LLM | 1.0 | -| [VisualGLM](https://gitee.com/mindspore/mindformers/blob/r1.0/research/visualglm/visualglm.md) | 6B | MM | 1.0 | -| [Ziya](https://gitee.com/mindspore/mindformers/blob/r1.0/research/ziya/ziya.md) | 13B | 稠密LLM | 1.0 | -| [Bert](https://gitee.com/mindspore/mindformers/blob/r0.8/docs/model_cards/bert.md) | 4M/110M | 稠密LLM | 0.8 | - -* ***LLM:*** *大语言模型(Large Language Model);* ***MM:*** *多模态(Multi-Modal)* \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/start/overview.md b/docs/mindformers/docs/source_zh_cn/start/overview.md deleted file mode 100644 index c0c8e1707d9be4d958bd319130a32f5202244bbf..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/start/overview.md +++ /dev/null @@ -1,15 +0,0 @@ -# 整体架构 - 
-[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/start/overview.md) - -MindSpore Transformers与昇思MindSpore、昇腾Ascend的端到端AI软硬件生态,形成的整体架构如下: - -1. 在硬件层面,MindSpore Transformers支持用户在Ascend服务器上运行大模型; -2. 在软件层面,MindSpore Transformers通过MindSpore提供的Python接口实现大模型相关代码,并由昇腾AI处理器配套软件包提供的算子库进行数据运算; -3. MindSpore Transformers目前支持的基础功能特性如下: - 1. 支持大模型[分布式并行](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/distributed_parallel.html)运行训练和推理等任务,并行能力包括数据并行、模型并行、超长序列并行等; - 2. 支持[模型权重转换](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/weight_conversion.html)、[分布式权重切分与合并](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/transform_weight.html)、不同格式[数据集加载](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/dataset.html)以及[断点续训](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/resume_training.html)等功能; - 3. 支持25+大模型[预训练](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/usage/pre_training.html)、[微调](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/usage/sft_tuning.html)、[推理](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/usage/inference.html)和[评测](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/usage/evaluation.html)等功能,同时支持对模型参数进行[量化](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/usage/quantization.html),具体支持模型列表可参考[模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/start/models.html); -4. MindSpore Transformers支持用户通过[MindIE](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/usage/mindie_deployment.html)进行模型服务化部署功能,同时支持使用[MindX](https://www.hiascend.com/software/mindx-dl)实现大规模集群调度;后续将支持更多第三方平台,敬请期待。 - -![/overall_architecture](./image/overall_architecture.png) diff --git a/docs/mindformers/docs/source_zh_cn/usage/dev_migration.md b/docs/mindformers/docs/source_zh_cn/usage/dev_migration.md deleted file mode 100644 index c70b78d8d4648f4d94b91f3420c75b2042e953d1..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/usage/dev_migration.md +++ /dev/null @@ -1,139 +0,0 @@ -# 开发迁移 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/usage/dev_migration.md) - -本文档将指导用户如何基于MindSpore Transformers开发构建一个大模型,并完成最基本的适配,以拉起训练和推理流程。 - -## 基于MindSpore Transformers构建大模型 - -MindSpore Transformers中大模型的基本组成包含配置、模型、分词器(适用于大语言模型)。此外,为了使用run_mindformer.py统一脚本拉起训练或推理流程,还需要准备用于训练或推理的`YAML`配置文件。 - -### 编写配置 - -模型配置是一个实例,包含模型的所有信息。MindSpore Transformers中所有模型的`__init__`方法都接收一个模型配置的实例作为入参,模型的所有子模块都通过这个配置实例中所包含的信息来初始化。 - -MindSpore Transformers提供了[PretrainedConfig](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.PretrainedConfig.html)类,负责提供一些配置的通用方法。所有模型的配置类都应该继承于PretrainedConfig类,开发者只需关心定义所有帮助构建大模型的配置参数:Transformer类大模型通常都拥有`seq_length`、`hidden_size`、`num_layers`、`num_heads`等配置参数,文本类的大模型通常还有`vocab_size`等。 - -可以参考MindSpore Transformers中Llama模型的配置类[LlamaConfig](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.LlamaConfig.html)。 - -> 如果您的模型与库内的模型非常相似,可以复用与该模型相同的配置。 - -### 编写模型 - -MindSpore Transformers的大模型基于MindSpore框架进行开发,其中开发者只需要关心模型网络本身的实现。 - -MindSpore 
Transformers提供了[PretrainedModel](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.PreTrainedModel.html)类,负责存储模型配置并处理加载、保存模型的方法。所有模型的类都应该继承于PretrainedModel类,并且模型的输入应该是统一的,即模型的`construct`方法的入参应该一致,具体入参和含义可以参考MindSpore Transformers中的Llama模型类[LlamaForCausalLM](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.LlamaForCausalLM.html)。同时,模型类必须实现基类的一些抽象方法,包括: - -- `prepare_inputs_for_generation`:为模型推理构建输入的方法。 -- `prepare_inputs_for_predict_layout`:为分布式加载模型权重构建虚拟输入的方法。 - -关于它们的具体含义,可以参考[LlamaForCausalLM](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.LlamaForCausalLM.html)中的描述。 - -> 如果您的模型结构与库内的模型非常相似,可以复用该模型的实现。 - -### 编写分词器(适用于大语言模型) - -分词器(Tokenizer)的作用是处理大语言模型的输入与输出。它在大语言模型的工作流程中是必需的。 - -MindSpore Transformers提供了[PretrainedTokenizer](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.PreTrainedTokenizer.html)类和[PretrainedTokenizerFast](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.PreTrainedTokenizerFast.html)类,分别是纯Python的实现和使用Rust库的实现。后者实现的区别是: - -- 在进行批量处理时速度显著提高; -- 额外包含一些在文本字符串和词元空间映射的方法(例如,获取包含给定字符的词元的索引或与给定词元相对应的字符跨度) - -所有分词器的类应该继承于PretrainedTokenizer类或PretrainedTokenizerFast类,具体实现可以参考[LlamaTokenizer](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.LlamaTokenizer.html)和[LlamaTokenizerFast](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.LlamaTokenizerFast.html)。 - -> 如果您的分词器与库内的分词器非常相似,可以复用该分词器的实现。 - -### 准备权重和数据集 - -如已有基于PyTorch的模型权重,可以参考[权重转换文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/weight_conversion.html)将权重转换为MindSpore格式的权重。 - -数据集的准备可以参考[数据集文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/dataset.html),或参考模型文档,如[Llama2说明文档——数据集准备](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#%E6%95%B0%E6%8D%AE%E5%8F%8A%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87)。 - -### 准备`YAML`配置文件 - -MindSpore Transformers使用`YAML`配置文件配置一个任务所需的所有参数,包括模型的配置参数、训练所需的配置参数(优化器、学习率、数据集等)、推理所需的配置参数(分词器等)、分布式并行的配置参数、上下文环境的配置参数等。 - -由于自定义模型的代码不在MindSpore Transformers库内,代码中的自定义模块没有注册在MindSpore Transformers中,因而不能被自动实例化。这些代码也称为外挂代码(如`research`目录下代码)。因此需要在编写的`YAML`配置文件中的对应模块配置下添加自动注册任意模块的配置项`auto_register`,设置为要注册的API接口的相对导入路径。后续在执行run_mindformer.py脚本拉起任务时添加注册路径的入参`--register_path`,设置为外挂代码所在目录的相对路径。 - -例如,`research`目录下的Llama3.1-8B模型的推理`YAML`配置文件[`research/llama3_1/predict_llama3_1_8b.yaml`](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3_1/llama3_1_8b/predict_llama3_1_8b.yaml)中,添加了自动注册的配置项`auto_register`,以注册[`research/llama3_1/llama3_1_tokenizer.py`](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3_1/llama3_1_tokenizer.py)中自定义的`Llama3Tokenizer`: - -```yaml -... -processor: - return_tensors: ms - tokenizer: - model_max_length: 8192 - vocab_file: "/path/tokenizer.model" - pad_token: "<|reserved_special_token_0|>" - type: Llama3Tokenizer - auto_register: llama3_1_tokenizer.Llama3Tokenizer - type: LlamaProcessor -... 
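-    # 注:若自定义的模型类、模型Config类同样位于外挂代码目录,也需在其对应type下按“模块文件名.类名”的格式添加auto_register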
-``` - -其中在`tokenizer`下配置了`Llama3Tokenizer`的相对导入路径`auto_register: llama3_1_tokenizer.Llama3Tokenizer`。 - -另外,需要在`tokenizer`下设置`vocab_file`为模型分词器`tokenizer.model`的真实路径。 - -可以运行如下命令拉起推理任务: - -```bash -python run_mindformer.py --config research/llama3_1/predict_llama3_1_8b.yaml --load_checkpoint path/to/llama3_1_8b.ckpt --register_path research/llama3_1 --predict_data "hello" -``` - -**参数说明** - -| 参数 | 说明 | -|:---------------:|:--------------| -| config | `YAML`配置文件的路径 | -| load_checkpoint | 加载的权重路径 | -| register_path | 外挂代码所在目录的路径 | -| predict_data | 推理的输入数据 | - -其中设置了`register_path`为外挂代码所在目录的路径`research/llama3_1`,模型权重的准备参考[Llama3.1说明文档——模型权重下载](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3_1/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E4%B8%8B%E8%BD%BD)。 - -配置文件的详细内容及可配置项可以参考[配置文件说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html)。在实际编写配置文件时,也可以参考库内已有的配置文件,例如[Llama2-7B微调的配置文件](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/finetune_llama2_7b.yaml)。 - -在准备完上述所有基本要素之后,可以参考MindSpore Transformers使用教程中的其余文档进行模型训练、微调、推理等流程的实践。后续模型调试调优可以参考[大模型精度调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/acc_optimize/acc_optimize.html)和[大模型性能调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/perf_optimize/perf_optimize.html)。 - -### 将模型贡献给MindSpore Transformers开源仓库 - -可以参考[MindSpore Transformers贡献指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/faq/mindformers_contribution.html),将模型贡献到MindSpore Transformers的开源仓库,供广大开发者研究和使用。 - -## MindSpore Transformers大模型迁移实践 - -### 基于Llama2-7B迁移Llama3-8B - -Llama3-8B与Llama2-7B拥有相同的模型结构,只有部分模型参数、分词器和权重不同。 - -#### 模型配置 - -以下对比了Llama2-7B和Llama3-8B的模型配置: - -![model_config_comparison](image/model_config_comparison.png) - -其中的区别有: - -- Llama3-8B的序列长度为8192,将`seq_length`修改为`8192`。 -- Llama3-8B使用GQA,每个key-value组的head数量为8,设置`n_kv_head`为`8`。 -- Llama3-8B的词表大小为128256,将`vocab_size`修改为`128256`。 -- Llama3-8B扩充了Feed-Forward Network的隐藏层大小至14336,设置`intermediate_size`为`14336`。 -- Llama3-8B修改了特殊词元索引,修改`bos_token_id`为`128000`、`eos_token_id`为`128001`、`pad_token_id`为`128002`。 -- Llama3-8B修改了旋转位置编码中的theta值为500000,修改`theta`为`500000`。 - -修改Llama2-7B的`YAML`配置文件中的对应内容即可得到[Llama3-8B的配置文件](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/llama3_8b/finetune_llama3_8b.yaml)。 - -#### 分词器 - -Llama3-8B重新实现了分词器。对照官方的实现,继承MindSpore Transformers中的PretrainedTokenizer实现Llama3Tokenizer,编写在[llama3_tokenizer.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/llama3_tokenizer.py)中。 - -#### 权重转换 - -Llama3-8B的参数命名和Llama2-7B一致,因此可以复用Llama2-7B的权重转换流程,参考[Llama3文档的权重转换章节](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E8%BD%AC%E6%8D%A2)。 - -#### 数据集处理 - -由于Llama3-8B的分词器与Llama2-7B不同,因此Llama3-8B需要在Llama2-7B的数据集处理脚本的基础上,替换Llama3-8B的分词器对数据进行预处理,参考[conversation.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/llama3_conversation.py)和[llama_preprocess.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/llama3_preprocess.py)。 - -关于MindSpore Transformers中Llama3的具体实现,可以参考MindSpore Transformers仓库中[Llama3的文件夹](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/llama3)。关于MindSpore Transformers中Llama3的使用,可以参考[LLama3的说明文档](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/README.md)。 diff --git a/docs/mindformers/docs/source_zh_cn/usage/evaluation.md b/docs/mindformers/docs/source_zh_cn/usage/evaluation.md deleted file mode 100644 index 
b70b9bc0d313d21fb5ddc9a086dfd9472c433d4f..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/usage/evaluation.md +++ /dev/null @@ -1,540 +0,0 @@ -# 评测 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/usage/evaluation.md) - -## Harness评测 - -### 基本介绍 - -[LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)是一个开源语言模型评测框架,提供60多种标准学术数据集的评测,支持HuggingFace模型评测、PEFT适配器评测、vLLM推理评测等多种评测方式,支持自定义prompt和评测指标,包含loglikelihood、generate_until、loglikelihood_rolling三种类型的评测任务。基于Harness评测框架对MindSpore Transformers进行适配后,支持加载MindSpore Transformers模型进行评测。 - -目前已验证过的模型和支持的评测任务如下表所示(其余模型和评测任务正在积极验证和适配中,请关注版本更新): - -| 已验证的模型 | 支持的评测任务 | -|----------|------------------------| -| Llama3 | gsm8k、ceval-valid、mmlu、cmmlu、race、lambada | -| Llama3.1 | gsm8k、ceval-valid、mmlu、cmmlu、race、lambada | -| Qwen2 | gsm8k、ceval-valid、mmlu、cmmlu、race、lambada | - -### 安装 - -Harness支持pip安装和源码编译安装两种方式。pip安装更简单快捷,源码编译安装更便于调试分析,用户可以根据需要选择合适的安装方式。 - -#### pip安装 - -用户可以执行如下命令安装Harness(推荐使用0.4.4版本): - -```shell -pip install lm_eval==0.4.4 -``` - -#### 源码编译安装 - -用户可以执行如下命令编译并安装Harness: - -```bash -git clone --depth 1 -b v0.4.4 https://github.com/EleutherAI/lm-evaluation-harness -cd lm-evaluation-harness -pip install -e . -``` - -### 使用方式 - -#### 评测前准备 - - 1. 创建一个新目录,例如名称为`model_dir`,用于存储模型yaml文件。 - 2. 在上个步骤创建的目录中,放置模型推理yaml配置文件(predict_xxx_.yaml)。不同模型的推理yaml配置文件所在目录位置,请参考[模型库](../start/models.md)。 - 3. 配置yaml文件。如果yaml中模型类、模型Config类、模型Tokenzier类使用了外挂代码,即代码文件在[research](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research)目录或其他外部目录下,需要修改yaml文件:在相应类的`type`字段下,添加`auto_register`字段,格式为“module.class”(其中“module”为类所在脚本的文件名,“class”为类名。如果已存在,则不需要修改)。 - - 以[predict_llama3_1_8b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3_1/llama3_1_8b/predict_llama3_1_8b.yaml)配置为例,对其中的部分配置项进行如下修改: - - ```yaml - run_mode: 'predict' # 设置推理模式 - load_checkpoint: 'model.ckpt' # 权重路径 - processor: - tokenizer: - vocab_file: "tokenizer.model" # tokenizer路径 - type: Llama3Tokenizer - auto_register: llama3_tokenizer.Llama3Tokenizer - ``` - - 关于每个配置项的详细说明请参考[配置文件说明](../appendix/conf_files.md)。 - 4. 如果使用`ceval-valid`、`mmlu`、`cmmlu`、`race`、`lambada`数据集进行评测,需要将`use_flash_attention`设置为`False`,以`predict_llama3_1_8b.yaml`为例,修改yaml如下: - - ```yaml - model: - model_config: - # ... - use_flash_attention: False # 设置为False - # ... - ``` - -#### 评测样例 - -执行脚本[run_harness.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/toolkit/benchmarks/run_harness.sh)进行评测。 - -run_harness.sh脚本参数配置如下表: - -| 参数 | 类型 | 参数介绍 | 是否必须 | -|------------------|-----|------------------------------------------------------------------------------------------------|------| -| `--register_path`| str | 外挂代码所在目录的绝对路径。比如[research](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research)目录下的模型目录 | 否(外挂代码必填) | -| `--model` | str | 需设置为 `mf` ,对应为MindSpore Transformers评估策略 | 是 | -| `--model_args` | str | 模型及评估相关参数,见下方模型参数介绍 | 是 | -| `--tasks` | str | 数据集名称。可传入多个数据集,使用逗号(,)分隔 | 是 | -| `--batch_size` | int | 批处理样本数 | 否 | - -其中,model_args参数配置如下表: - -| 参数 | 类型 | 参数介绍 | 是否必须 | -|----------------|---------|--------------------|------| -| `pretrained` | str | 模型目录路径 | 是 | -| `max_length` | int | 模型生成的最大长度 | 否 | -| `use_parallel` | bool | 开启并行策略(执行多卡评测必须开启) | 否 | -| `tp` | int | 张量并行数 | 否 | -| `dp` | int | 数据并行数 | 否 | - -Harness评测支持单机单卡、单机多卡、多机多卡场景,每种场景的评测样例如下: - -1. 
单卡评测样例 - - ```shell - source toolkit/benchmarks/run_harness.sh \ - --register_path mindformers/research/llama3_1 \ - --model mf \ - --model_args pretrained=model_dir \ - --tasks gsm8k - ``` - -2. 多卡评测样例 - - ```shell - source toolkit/benchmarks/run_harness.sh \ - --register_path mindformers/research/llama3_1 \ - --model mf \ - --model_args pretrained=model_dir,use_parallel=True,tp=4,dp=1 \ - --tasks ceval-valid \ - --batch_size BATCH_SIZE WORKER_NUM - ``` - - - `BATCH_SIZE`为模型批处理样本数; - - `WORKER_NUM`为使用计算卡的总数。 - -3. 多机多卡评测样例 - - 节点0(主节点)命令: - - ```shell - source toolkit/benchmarks/run_harness.sh \ - --register_path mindformers/research/llama3_1 \ - --model mf \ - --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ - --tasks lambada \ - --batch_size 2 8 4 192.168.0.0 8118 0 output/msrun_log False 300 - ``` - - 节点1(副节点)命令: - - ```shell - source toolkit/benchmarks/run_harness.sh \ - --register_path mindformers/research/llama3_1 \ - --model mf \ - --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ - --tasks lambada \ - --batch_size 2 8 4 192.168.0.0 8118 1 output/msrun_log False 300 - ``` - - 节点n(副节点)命令: - - ```shell - source toolkit/benchmarks/run_harness.sh \ - --register_path mindformers/research/llama3_1 \ - --model mf \ - --model_args pretrained=model_dir,use_parallel=True,tp=8,dp=1 \ - --tasks lambada \ - --batch_size BATCH_SIZE WORKER_NUM LOCAL_WORKER MASTER_ADDR MASTER_PORT NODE_RANK output/msrun_log False CLUSTER_TIME_OUT - ``` - - - `BATCH_SIZE`为模型批处理样本数; - - `WORKER_NUM`为所有节点中使用计算卡的总数; - - `LOCAL_WORKER`为当前节点中使用计算卡的数量; - - `MASTER_ADDR`为分布式启动主节点的ip; - - `MASTER_PORT`为分布式启动绑定的端口号; - - `NODE_RANK`为当前节点的rank id; - - `CLUSTER_TIME_OUT`为分布式启动的等待时间,单位为秒。 - - 多机多卡评测需要分别在不同节点运行脚本,并将参数MASTER_ADDR设置为主节点的ip地址, 所有节点设置的ip地址相同,不同节点之间仅参数NODE_RANK不同。 - -### 查看评测结果 - -执行评测命令后,评测结果将会在终端打印出来。以gsm8k为例,评测结果如下,其中Filter对应匹配模型输出结果的方式,n-shot对应数据集内容格式,Metric对应评测指标,Value对应评测分数,Stderr对应分数误差。 - -| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr | -|-------|--------:|------------------|-------:|-------------|---|--------|---|--------| -| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.5034 | ± | 0.0138 | -| | | strict-match | 5 | exact_match | ↑ | 0.5011 | ± | 0.0138 | - -## VLMEvalKit评测 - -### 基本介绍 - -[VLMEvalKit](https://github.com/open-compass/VLMEvalKit) -是一款专为大型视觉语言模型评测而设计的开源工具包,支持在各种基准测试上对大型视觉语言模型进行一键评估,无需进行繁重的数据准备工作,让评估过程更加简便。它支持多种图文多模态评测集和视频多模态评测集,支持多种API模型以及基于PyTorch和HF的开源模型,支持自定义prompt和评测指标。基于VLMEvalKit评测框架对MindSpore Transformers进行适配后,支持加载MindSpore Transformers中多模态大模型进行评测。 - -目前已适配的模型和支持的评测数据集如下表所示(其余模型和评测数据集正在积极适配中,请关注版本更新): - -| 适配的模型 | 支持的评测任务 | -|--|---------------------------------------------------| -| cogvlm2-image-llama3-chat | MME、MMBench、COCO Caption、MMMU_DEV_VAL、TextVQA_VAL | -| cogvlm2-video-llama3-chat | MMBench-Video、MVBench | - -### 支持特性说明 - -1. 支持自动下载评测数据集; -2. 一键生成评测结果。 - -### 安装 - -#### 下载代码并编译,安装依赖包 - -1. 下载并修改代码:由于开源框架在跑MVBench数据集时存在已知问题,所以需要使用导入patch补丁的方式修改源码。获取[eval.patch](https://github.com/user-attachments/files/17956417/eval.patch),下载放入本地目录中。导入patch时要使用patch文件的绝对路径。 - - 执行以下命令: - - ```bash - git clone https://github.com/open-compass/VLMEvalKit.git - cd VLMEvalKit - git checkout 78a8cef3f02f85734d88d534390ef93ecc4b8bed - git apply /path/to/eval.patch - ``` - -2. 
安装依赖包 - - 在下载好的代码中,找到requirements.txt(VLMEvalKit/requirements.txt)文件,修改成如下内容: - - ```txt - gradio==4.40.0 - huggingface_hub==0.24.2 - imageio==2.35.1 - matplotlib==3.9.1 - moviepy==1.0.3 - numpy==1.26.4 - omegaconf==2.3.0 - openai==1.3.5 - opencv-python==4.10.0.84 - openpyxl==3.1.5 - pandas==2.2.2 - peft==0.12.0 - pillow==10.4.0 - portalocker==2.10.1 - protobuf==5.27.2 - python-dotenv==1.0.1 - requests==2.32.3 - rich==13.7.1 - sentencepiece==0.2.0 - setuptools==69.5.1 - sty==1.0.6 - tabulate==0.9.0 - tiktoken==0.7.0 - timeout-decorator==0.5.0 - torch==2.5.1 - tqdm==4.66.4 - transformers==4.43.3 - typing_extensions==4.12.2 - validators==0.33.0 - xlsxwriter==3.2.0 - torchvision==0.20.1 - ``` - - 执行命令: - - ```bash - pip install -r requirements.txt - ``` - -#### 安装FFmpeg - -Ubuntu系统按照如下步骤安装: - -1. 更新系统包列表,安装编译FFmpeg所需的系统依赖库。 - - ```bash - apt-get update - apt-get -y install autoconf automake build-essential libass-dev libfreetype6-dev libsdl2-dev libtheora-dev libtool libva-dev libvdpau-dev libvorbis-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev pkg-config texinfo zlib1g-dev yasm libx264-dev libfdk-aac-dev libmp3lame-dev libopus-dev libvpx-dev - ``` - -2. 从FFmpeg官网下载FFmpeg4.1.11的源码压缩包,解压源码包并进入解压后的目录;配置FFmpeg的编译选项:指定FFmpeg的安装路径(绝对路径),生成共享库,启用对特定编解码器的支持,启用非自由和GPL许可的功能;编译并安装FFmpeg。 - - ```bash - wget --no-check-certificate https://www.ffmpeg.org/releases/ffmpeg-4.1.11.tar.gz - tar -zxvf ffmpeg-4.1.11.tar.gz - cd ffmpeg-4.1.11 - ./configure --prefix=/{path}/ffmpeg-xxx --enable-shared --enable-libx264 --enable-libfdk-aac --enable-libmp3lame --enable-libopus --enable-libvpx --enable-nonfree --enable-gpl - make && make install - ``` - -OpenEuler系统按照如下步骤安装: - -1. 从FFmpeg官网下载FFmpeg4.1.11的源码压缩包,解压源码包并进入解压后的目录;配置FFmpeg的编译选项:指定FFmpeg的安装路径(绝对路径);编译并安装FFmpeg。 - - ```bash - wget --no-check-certificate https://www.ffmpeg.org/releases/ffmpeg-4.1.11.tar.gz - tar -zxvf ffmpeg-4.1.11.tar.gz - cd ffmpeg-4.1.11 - ./configure --enable-shared --disable-x86asm --prefix=/path/to/ffmpeg - make && make install - ``` - -2. 配置环境变量,`FFMPEG_PATH`需要指定安装FFmpeg的绝对路径,以便系统能够正确找到和使用FFmpeg及其相关库。 - - ```bash - vi ~/.bashrc - export FFMPEG_PATH=/path/to/ffmpeg/ - export LD_LIBRARY_PATH=$FFMPEG_PATH/lib:$LD_LIBRARY_PATH - source ~/.bashrc - ``` - -#### 安装Decord - -Ubuntu系统按照如下步骤安装: - -1. 拉取Decord代码,进入`decord`目录,执行以下命令: - - ```bash - git clone --recursive -b v0.6.0 https://github.com/dmlc/decord.git - cd decord - ``` - -2. 创建并进入`build`目录,配置Decord的编译选项,禁用CUDA支持,启用Release模式(优化性能),指定FFmpeg的安装路径,编译Decord库。将编译生成的libdecord.so库文件复制到系统库目录,复制到`decord`的`python`目录。 - - ```bash - mkdir build - cd build - cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release -DFFMPEG_DIR=/{path}/ffmpeg-4.1.11 && make - cp libdecord.so /usr/local/lib/ - cp libdecord.so ../python/decord/libdecord.so - ``` - -3. 进入`decord`目录中的`python`文件夹,安装`numpy`依赖项,安装Decord的python包。将FFmpeg的库路径(绝对路径)添加到`LD_LIBRARY_PATH`环境变量中,确保运行时能够找到FFmpeg的共享库。 - - ```bash - cd /path/to/decord/python - pip install numpy - python setup.py install - export LD_LIBRARY_PATH=/path/to/ffmpeg-4.1.11/lib/:$LD_LIBRARY_PATH - ``` - -4. 执行Python命令,测试Decord是否安装成功,没有报错即为安装成功。 - - ```bash - python -c "import decord; from decord import VideoReader" - ``` - -OpenEuler系统按照如下步骤安装: - -1. 拉取Decord代码,进入`decord`目录。 - - ```bash - git clone --recursive -b v0.6.0 https://github.com/dmlc/decord - cd decord - ``` - -2. 创建并进入`build`目录,配置Decord的编译选项,指定FFmpeg的安装路径(绝对路径),编译Decord库;进入`decord`目录中的python文件夹,配置环境变量,指定`PYTHONPATH`;安装Decord的python包。 - - ```bash - mkdir build && cd build - cmake -DFFMPEG_DIR=/path/ffmpeg-4.1.11 .. 
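    # 注:-DFFMPEG_DIR 需指向前文编译安装 FFmpeg 时 --prefix 指定的安装目录(绝对路径),否则 cmake 可能无法找到 FFmpeg 相关库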
- make - cd ../python - pwd=$PWD - echo "PYTHONPATH=$PYTHONPATH:$pwd" >> ~/.bashrc - source ~/.bashrc - python3 setup.py install - ``` - -3. 执行python命令,测试Decord是否安装成功,没有报错即为安装成功。 - - ```bash - python -c "import decord; from decord import VideoReader" - ``` - -### 评测 - -#### 评测前准备 - -1. 创建一个新目录,例如名称为`model_dir`,用于存储模型yaml文件; -2. 在上个步骤创建的目录中放置模型推理yaml配置文件(predict_xxx_.yaml),不同模型的推理yaml配置文件的目录位置参考[模型库](../start/models.md)各模型说明文档中的模型文件树; -3. 配置yaml配置文件。 - - 以[predict_cogvlm2_image_llama3_chat_19b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/predict_cogvlm2_image_llama3_chat_19b.yaml)配置为例: - - ```yaml - load_checkpoint: "/{path}/model.ckpt" # 指定权重文件路径 - model: - model_config: - use_past: True # 开启增量推理 - is_dynamic: False # 关闭动态shape - - tokenizer: - vocab_file: "/{path}/tokenizer.model" # 指定tokenizer文件路径 - ``` - - 配置yaml文件,参考[配置文件说明](../appendix/conf_files.md)。 -4. MMbench-Video数据集评测需要使用GPT-4 Turbo模型进行评测打分,请提前准备好相应的API Key,并放在VLMEvalKit/.env文件中,内容如下所示: - - ```text - OPENAI_API_KEY=your_apikey - ``` - -5. MVBench数据集评测开始时,如果提示需要输入HuggingFace密钥,请按提示输入,保证后续评测的正常执行。 - -#### 拉起评测任务 - -在MindSpore Transformers本地代码仓根目录下执行脚本:[run_vlmevalkit.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/toolkit/benchmarks/run_vlmevalkit.sh)。 - -执行如下命令拉起评测任务: - -```shell -#!/bin/bash - -source toolkit/benchmarks/run_vlmevalkit.sh \ - --data MMMU_DEV_VAL \ - --model cogvlm2-image-llama3-chat \ - --verbose \ - --work_dir /path/to/cogvlm2-image-eval-result \ - --model_path model_dir -``` - -### 评测参数 - -| 参数 | 类型 | 参数介绍 | 是否必须 | -|-------------------|-----|------------------------------------------------------------------------------------------------|-----------| -| `--data` | str | 数据集名称,可传入多个数据集,空格分割。 | 是 | -| `--model` | str | 模型名称。 | 是 | -| `--verbose` | / | 输出评测运行过程中的日志。 | 否 | -| `--work_dir` | str | 存放评测结果的目录,默认存储在当前执行目录的`outputs`文件夹下。 | 否 | -| `--model_path` | str | 包含配置文件的文件夹路径。 | 是 | -| `--register_path` | str | 外挂代码所在目录的绝对路径。比如[research](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research)目录下的模型目录。 | 否(外挂代码必填) | - -如果因网络限制,服务器不支持在线下载图文数据集时,可以将本地下载好的以.tsv结尾的数据集文件上传至服务器~/LMUData目录下,进行离线评测。(例如:~/LMUData/MME.tsv 或 ~/LMUData/MMBench_DEV_EN.tsv 或 ~/LMUData/COCO_VAL.tsv) - -### 查看评测结果 - -按照上述方式评估后,在存储评测结果的目录中,找到以.json或以.csv结尾的文件查看评估的结果。 - -评测样例结果如下,其中`Bleu`和`ROUGE_L`表示评估翻译质量的指标,`CIDEr`表示评估图像描述任务的指标。 - -```json -{ - "Bleu": [ - 15.523950970070652, - 8.971141548228058, - 4.702477458554666, - 2.486860744700995 - ], - "ROUGE_L": 15.575063213115946, - "CIDEr": 0.01734615519604295 -} -``` - -## 使用VideoBench数据集进行模型评测 - -### 基本介绍 - -[Video-Bench](https://github.com/PKU-YuanGroup/Video-Bench/tree/main) 是首个针对 Video-LLM 的综合评估基准,具有三级能力评估,可以系统地评估模型在视频专属理解、先验知识融入和基于视频的决策能力方面的表现。 - -### 评测前准备 - -1. 数据集下载 - - 下载[Video-Bench中的视频数据](https://huggingface.co/datasets/LanguageBind/Video-Bench),解压后按照如下目录格式进行放置: - - ```text - egs/VideoBench/ - └── Eval_video - ├── ActivityNet - │ ├── v__2txWbQfJrY.mp4 - │ ... - ├── Driving-decision-making - │ ├── 1.mp4 - │ ... - ... - ``` - -2. 文本下载 - - 下载[Video-Bench中的文本数据](https://github.com/PKU-YuanGroup/Video-Bench/tree/main?tab=readme-ov-file),解压后按照如下目录格式进行放置: - - ```text - egs/Video-Bench/ - └── Eval_QA - ├── Youcook2_QA_new.json等json文件 - ... - ``` - -3. 
所有问题的正确答案下载 - - 下载[Video-Bench中的答案数据](https://huggingface.co/spaces/LanguageBind/Video-Bench/resolve/main/file/ANSWER.json)。 - -> 注:Video-Bench中的文本数据按照“egs/VideoBench/Eval_QA”(目录至少两层,且最后一层是`Eval_QA`)的路径格式进行存储;Video-Bench中的视频数据按照“egs/VideoBench/Eval_video”(目录至少两层,且最后一层是`Eval_video`)的路径格式进行存储。 - -### 评测 - -执行脚本路径可参考链接:[eval_with_videobench.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/toolkit/benchmarks/eval_with_videobench.py)。 - -#### 执行推理脚本,获取推理结果 - -```shell -python toolkit/benchmarks/eval_with_videobench.py \ ---model_path model_path \ ---dataset_name dataset_name \ ---Eval_QA_root Eval_QA_root \ ---Eval_Video_root Eval_Video_root \ ---chat_conversation_output_folder output -``` - -> 参数`Eval_QA_root`填写Eval_QA的上一层目录;参数`Eval_Video_root`填写Eval_video的上一层目录。 - -**参数说明** - -| **参数** | **是否必选** | **说明** | -|------------------------------------|---------|--------------------------------------------| -| `--model_path` | 是 | 存储模型相关文件的文件夹路径,包含模型配置文件及模型词表文件。 | -| `--dataset_name` | 否 | 评测数据子集名称,默认为None,评测VideoBench的所有子集。 | -| `--Eval_QA_root` | 是 | 存放VideoBench数据集的json文件目录。 | -| `--Eval_Video_root` | 是 | 存放VideoBench数据集的视频文件目录。 | -| `--chat_conversation_output_folder` | 否 | 生成结果文件的目录。默认存放在当前目录的Chat_results文件夹下。 | - -运行结束后,在chat_conversation_output_folder目录下会生成对话结果文件。 - -#### 根据生成结果进行评测打分 - -Video-Bench可以根据模型生成的答案利用ChatGPT或T5进行评估,最终得到13个数据子集的最终分数。 - -例如:使用ChatGPT进行评估打分: - -```shell -python Step2_chatgpt_judge.py \ ---model_chat_files_folder ./Chat_results \ ---apikey sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx \ ---chatgpt_judge_output_folder ./ChatGPT_Judge - -python Step3_merge_into_one_json.py \ ---chatgpt_judge_files_folder ./ChatGPT_Judge \ ---merge_file ./Video_Bench_Input.json -``` - -上述评测打分命令中的脚本路径为:[Step2_chatgpt_judge.py](https://github.com/PKU-YuanGroup/Video-Bench/blob/main/Step2_chatgpt_judge.py)、[Step3_merge_into_one_json.py](https://github.com/PKU-YuanGroup/Video-Bench/blob/main/Step3_merge_into_one_json.py) - -ChatGPT可能会将部分问题的回答视为格式错误,因此需要多次运行Step2_chatgpt_judge.py以确保每个问题都由ChatGPT进行验证。 - -## FAQ - -1. 使用Harness或VLMEvalKit进行评测,在加载HuggingFace数据集时,报错`SSLError`: - - 参考[SSL Error报错解决方案](https://stackoverflow.com/questions/71692354/facing-ssl-error-with-huggingface-pretrained-models)。 - - 注意:关闭SSL校验存在风险,可能暴露在中间人攻击(MITM)下。仅建议在测试环境或你完全信任的连接里使用。 - -2. 
使用VLMEvalKit中的MVBench数据集进行评测,出现`AssertionError`: - - 由于开源框架`VLMEvalKit`在跑`MVBench`数据集时存在已知问题,请参考开源框架的[issue](https://github.com/open-compass/VLMEvalKit/issues/888)进行修改,或删除评测过程中产生的文件(由参数`--work_dir`指定,默认在当前执行目录的`outputs`文件夹)重新执行。 \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/usage/image/model_config_comparison.png b/docs/mindformers/docs/source_zh_cn/usage/image/model_config_comparison.png deleted file mode 100644 index be52b9c4ee18c3db7662ffa0f23d01861be8a250..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/usage/image/model_config_comparison.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/usage/image/multi_modal.png b/docs/mindformers/docs/source_zh_cn/usage/image/multi_modal.png deleted file mode 100644 index 9ad095bf884e6f052deea77f0bd4725284fede2d..0000000000000000000000000000000000000000 Binary files a/docs/mindformers/docs/source_zh_cn/usage/image/multi_modal.png and /dev/null differ diff --git a/docs/mindformers/docs/source_zh_cn/usage/inference.md b/docs/mindformers/docs/source_zh_cn/usage/inference.md deleted file mode 100644 index e6d0d863d3012bed4ecd15133bcb6d1929e4fe60..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/usage/inference.md +++ /dev/null @@ -1,361 +0,0 @@ -# 推理 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/usage/inference.md) - -## 概述 - -MindSpore Transformers 提供了大模型推理能力,用户可以执行 `run_mindformer` 统一脚本,或者编写代码调用高阶接口进行推理。使用 `run_mindformer` 统一脚本可以不编写代码,直接通过配置文件启动,用法更便捷。 - -## 基本流程 - -推理流程可以分解成以下几个步骤: - -### 1. 选择推理的模型 - -根据需要的推理任务,选择不同的模型,如文本生成可以选择 Llama2 等。 - -### 2. 准备模型权重 - -模型权重可分为完整权重和分布式权重两种,使用时需参考以下说明。 - -#### 2.1 完整权重 - -完整权重可以通过以下两种方式获得: - -1. 从HuggingFace模型库中下载相应模型的开源权重后,参考[权重格式转换](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/weight_conversion.html)将其转换为ckpt格式。 -2. 预训练或者微调后的分布式权重,通过[合并](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/transform_weight.html)生成一个完整权重。 - -#### 2.2 分布式权重 - -分布式权重一般通过预训练或者微调后获得,默认保存在`./output/checkpoint_network`目录,需要先转换为单卡或多卡权重,再进行单卡或多卡推理。 - -如果推理使用的权重切分方式,与推理任务中提供的模型切分方式不同,例如以下这几种情况,则需要额外对权重进行切分方式的转换,以匹配实际推理任务中模型的切分方式。 - -1. 多卡训练得到的权重在单卡上推理; -2. 8卡训练的权重在2卡上推理; -3. 已经切分好的分布式权重在单卡上推理等。 - -下文的命令示例均采用了在线自动切分的方式,通过设置参数 `--auto_trans_ckpt` 为 `True` 和 `--src_strategy_path_or_dir` 为权重的切分策略文件或目录路径(预训练或者微调后,默认保存在`./output/strategy`下)在推理任务中自动完成切分。更多用法可参考[分布式权重的合并和切分](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/transform_weight.html)。 - -> 由于训练和推理任务都使用 `./output` 作为默认输出路径,当使用训练任务所输出的策略文件,作为推理任务的源权重策略文件时,需要将默认输出路径下的策略文件目录移动到其他位置,避免被推理任务的进程清空,如: -> -> ```mv ./output/strategy/ ./strategy``` - -### 3. 
执行推理任务 - -使用 `run_mindformer` 统一脚本或调用高阶接口执行推理任务。 - -## 使用 run_mindformer 一键启动脚本推理 - -单卡推理可以直接执行[run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/run_mindformer.py)脚本,多卡推理需要借助[scripts/msrun_launcher.sh](https://gitee.com/mindspore/mindformers/blob/r1.5.0/scripts/msrun_launcher.sh)来启动。 - -run_mindformer.py的参数说明如下: - -| 参数 | 参数说明 | -| :----------------------- |:---------------------------------------------| -| config | yaml配置文件的路径 | -| run_mode | 运行的模式,推理设置为predict | -| use_parallel | 是否使用多卡推理 | -| load_checkpoint | 加载的权重路径 | -| predict_data | 推理的输入数据,多batch推理时需要传入输入数据的txt文件路径,包含多行输入 | -| auto_trans_ckpt | 自动权重切分,默认值为False | -| src_strategy_path_or_dir | 权重的策略文件路径 | -| predict_batch_size | 多batch推理的batch_size大小 | -| modal_type | 多模态推理场景下,模型推理输入对应模态,图片路径对应'image',文本对应'text' | - -msrun_launcher.sh包括run_mindformer.py命令和推理卡数两个参数。 - -下面将以 Llama2 为例介绍单卡和多卡推理的用法,推荐配置为[predict_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/predict_llama2_7b.yaml)文件。 - -> 推理时会自动下载Llama2模型所需的词表文件 `tokenizer.model` (需要保障网络畅通)。如果本地有这个文件,可以提前把它放在 `./checkpoint_download/llama2/` 目录下。 - -### 单卡推理 - -当使用完整权重推理时,执行以下命令即可启动推理任务: - -```shell -python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---use_parallel False \ ---load_checkpoint path/to/checkpoint.ckpt \ ---predict_data 'I love Beijing, because' -``` - -当使用分布式权重推理时,需要增加 ``--auto_trans_ckpt`` 和 ``--src_strategy_path_or_dir`` 的入参,启动命令如下: - -```shell -python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---use_parallel False \ ---auto_trans_ckpt True \ ---src_strategy_path_or_dir ./strategy \ ---load_checkpoint path/to/checkpoint_dir \ ---predict_data 'I love Beijing, because' -``` - -出现如下结果,证明推理成功。推理结果也会保存到当前目录下的 `text_generation_result.txt` 文件中。详细日志可通过`./output/msrun_log` 目录查看。 - -```text -'text_generation_text': [I love Beijing, because it is a city that is constantly constantly changing. I have been living here for ......] -``` - -### 多卡推理 - -多卡推理的配置要求与单卡存在差异,需参考如下说明修改[predict_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/predict_llama2_7b.yaml)配置。 - -1. 模型并行model_parallel的配置和使用的卡数需保持一致,下文用例为2卡推理,需将model_parallel设置成2; -2. 
当前版本的多卡推理不支持数据并行,需将data_parallel设置为1。 - -**修改前的配置:** - -```yaml -parallel_config: - data_parallel: 8 - model_parallel: 1 - pipeline_stage: 1 -``` - -**修改后的配置:** - -```yaml -parallel_config: - data_parallel: 1 - model_parallel: 2 - pipeline_stage: 1 -``` - -当使用完整权重推理时,需要开启在线切分方式加载权重,参考以下命令: - -```shell -bash scripts/msrun_launcher.sh "python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---use_parallel True \ ---auto_trans_ckpt True \ ---load_checkpoint path/to/checkpoint.ckpt \ ---predict_data 'I love Beijing, because'" \ -2 -``` - -当使用分布式权重推理,且权重的切分策略与模型的切分策略一致时,参考以下命令: - -```shell -bash scripts/msrun_launcher.sh "python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---use_parallel True \ ---load_checkpoint path/to/checkpoint_dir \ ---predict_data 'I love Beijing, because'" \ -2 -``` - -当使用分布式权重推理,且权重的切分策略与模型的切分策略不一致时,需要打开在线切分功能加载权重,参考以下命令: - -```shell -bash scripts/msrun_launcher.sh "python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---use_parallel True \ ---auto_trans_ckpt True \ ---src_strategy_path_or_dir ./strategy \ ---load_checkpoint path/to/checkpoint_dir \ ---predict_data 'I love Beijing, because'" \ -2 -``` - -推理结果查看方式,与单卡推理相同。 - -### 多卡多batch推理 - -多卡多batch推理的启动方式可参考上述[多卡推理](#多卡推理),但是需要增加`predict_batch_size`的入参,并修改`predict_data`的入参。 - -`input_predict_data.txt`文件的内容和格式是每一行都是一个输入,问题的个数与`predict_batch_size`一致,可以参考以下格式: - -```txt -I love Beijing, because -I love Beijing, because -I love Beijing, because -I love Beijing, because -``` - -以完整权重推理为例,可以参考以下命令启动推理任务: - -```shell -bash scripts/msrun_launcher.sh "python run_mindformer.py \ ---config configs/llama2/predict_llama2_7b.yaml \ ---run_mode predict \ ---predict_batch_size 4 \ ---use_parallel True \ ---auto_trans_ckpt True \ ---load_checkpoint path/to/checkpoint.ckpt \ ---predict_data path/to/input_predict_data.txt" \ -2 -``` - -推理结果查看方式,与单卡推理相同。 - -### 多模态推理 - -以`cogvlm2-llama3-chat-19B`模型为例,可以参考以下流程启动推理任务: - -修改模型配置文件[predict_cogvlm2_image_llama3_chat_19b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/predict_cogvlm2_image_llama3_chat_19b.yaml)。 - -```shell -model: - model_config: - use_past: True # 开启增量推理 - is_dynamic: False # 关闭动态shape - - tokenizer: - vocab_file: "/{path}/tokenizer.model" # 指定tokenizer文件路径 -``` - -启动推理脚本 - -```shell -python run_mindformer.py \ - --config configs/cogvlm2/predict_cogvlm2_image_llama3_chat_19b.yaml \ - --run_mode predict \ - --predict_data "/path/image.jpg" "Please describe this image." \ # 模型推理输入,第一个输入是图片路径,第二个输入是文本 - --modal_type image text \ # 模型推理输入对应模态,图片路径对应'image',文本对应'text' - --load_checkpoint /{path}/cogvlm2-image-llama3-chat.ckpt -``` - -## 基于高阶接口推理 - -> 基于安全性考虑,当前暂不推荐使用高阶接口进行推理,本章节将于下个版本下线。如有任何问题或建议,请通过[社区Issue](https://gitee.com/mindspore/mindformers/issues/new)提交反馈。感谢您的理解与支持! 
- -MindSpore Transformers除了提供 `run_mindformer` 统一脚本进行推理外,也支持用户自定义调用高阶接口`pipeline`或`chat`接口实现。 - -### Pipeline接口 - -基于 `pipeline` 接口的自定义文本生成推理任务流程,支持单卡推理和多卡推理。关于如何使用 `pipeline` 接口启动任务并输出结果,可以参考以下实现方式,具体参数说明可以查看 [pipeline 接口的API文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/mindformers/mindformers.pipeline.html#mindformers.pipeline)。 - -#### 增量推理 - -```python -from mindformers import build_context -from mindformers import AutoModel, AutoTokenizer, pipeline, TextStreamer - -# 构造输入 -inputs = ["I love Beijing, because", "LLaMA is a", "Huawei is a company that"] - -# 初始化环境 -build_context({'context': {'mode': 0}, 'run_mode': 'predict', 'parallel': {}, 'parallel_config': {}}) - -# 实例化tokenizer -tokenizer = AutoTokenizer.from_pretrained('llama2_7b') - -# 模型实例化 -# 修改成本地的权重路径 -model = AutoModel.from_pretrained('llama2_7b', checkpoint_name_or_path="path/to/llama2_7b.ckpt", use_past=True) -# 模型实例化可使用魔乐社区模型在线加载,传入仓库名,格式为MindSpore-Lab/model_name -# model = AutoModel.from_pretrained('MindSpore-Lab/qwen1_5_7b-chat') - -# pipeline启动非流式推理任务 -text_generation_pipeline = pipeline(task="text_generation", model=model, tokenizer=tokenizer) -outputs = text_generation_pipeline(inputs, max_length=512, do_sample=False, top_k=3, top_p=1) -for output in outputs: - print(output) -``` - -通过将示例保存到 `pipeline_inference.py` 中,并且修改加载权重的路径,然后直接执行 `pipeline_inference.py` 脚本。 - -```shell -python pipeline_inference.py -``` - -执行以上命令的推理结果如下: - -```text -'text_generation_text': [I love Beijing, because it is a city that is constantly constantly changing. I have been living here for ......] -'text_generation_text': [LLaMA is a large-scale, open-source, multimodal, multilingual, multitask, and multimodal pretrained language model. It is ......] -'text_generation_text': [Huawei is a company that has been around for a long time. ......] -``` - -#### 流式推理 - -```python -from mindformers import build_context -from mindformers import AutoModel, AutoTokenizer, pipeline, TextStreamer - -# 构造输入 -inputs = ["I love Beijing, because", "LLaMA is a", "Huawei is a company that"] - -# 初始化环境 -build_context({'context': {'mode': 0}, 'run_mode': 'predict', 'parallel': {}, 'parallel_config': {}}) - -# 实例化tokenizer -tokenizer = AutoTokenizer.from_pretrained('llama2_7b') - -# 模型实例化 -# 修改成本地的权重路径 -model = AutoModel.from_pretrained('llama2_7b', checkpoint_name_or_path="path/to/llama2_7b.ckpt", use_past=True) -# 模型实例化可使用魔乐社区模型在线加载,传入模型名为Repo_id,格式为MindSpore-Lab/model_name -# model = AutoModel.from_pretrained('MindSpore-Lab/qwen1_5_7b-chat') - -# pipeline启动流式推理任务 -streamer = TextStreamer(tokenizer) -text_generation_pipeline = pipeline(task="text_generation", model=model, tokenizer=tokenizer, streamer=streamer) -_ = text_generation_pipeline(inputs, max_length=512, do_sample=False, top_k=3, top_p=1) -``` - -通过将示例保存到 `pipeline_inference.py` 中,并且修改加载权重的路径,然后直接执行 `pipeline_inference.py` 脚本。 - -```shell -python pipeline_inference.py -``` - -执行以上命令的推理结果如下: - -```text -'text_generation_text': [I love Beijing, because it is a city that is constantly constantly changing. I have been living here for ......] -'text_generation_text': [LLaMA is a large-scale, open-source, multimodal, multilingual, multitask, and multimodal pretrained language model. It is ......] -'text_generation_text': [Huawei is a company that has been around for a long time. ......] 
-``` - -### chat接口 - -基于 `chat` 接口的对话文本生成推理任务流程,通过提供的分词器添加聊天模板后,对用户的查询进行推断。可以参考以下实现方式,具体参数说明可以查看 [chat 接口的API文档](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/generation/mindformers.generation.GenerationMixin.html#mindformers.generation.GenerationMixin.chat)。 - -```python -from mindformers import build_context -from mindformers import AutoModel, AutoTokenizer - -# 构造输入 -query = "Hello!" - -# 初始化环境 -build_context({'context': {'mode': 0}, 'run_mode': 'predict', 'parallel': {}, 'parallel_config': {}}) - -# 实例化tokenizer -tokenizer = AutoTokenizer.from_pretrained('llama2_7b') - -# 模型实例化 -# 修改成本地的权重路径 -model = AutoModel.from_pretrained('llama2_7b', checkpoint_name_or_path="path/to/llama2_7b.ckpt", use_past=True) -# 模型实例化可使用魔乐社区模型在线加载,传入仓库名,格式为MindSpore-Lab/model_name -# model = AutoModel.from_pretrained('MindSpore-Lab/qwen1_5_7b-chat') - -# 调用chat接口启动推理任务 -response, history = model.chat(tokenizer=tokenizer, query=query, max_length=32) -print(response) -``` - -通过将示例保存到 `chat_inference.py` 中,并且修改加载权重的路径,然后直接执行 `chat_inference.py` 脚本。 - -```shell -python chat_inference.py -``` - -执行以上命令的推理结果如下: - -```text -Thanks, sir. -``` - -## 更多信息 - -更多关于不同模型的推理示例,请访问[MindSpore Transformers 已支持模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/start/models.html)。 \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/usage/mindie_deployment.md b/docs/mindformers/docs/source_zh_cn/usage/mindie_deployment.md deleted file mode 100644 index 88222751d7bb7bb93bd362400865b23122991716..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/usage/mindie_deployment.md +++ /dev/null @@ -1,349 +0,0 @@ -# 服务化部署 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/usage/mindie_deployment.md) - -## MindIE介绍 - -MindIE,全称Mind Inference Engine,是基于昇腾硬件的高性能推理框架。详情参考[官方介绍文档](https://www.hiascend.com/software/mindie)。 - -MindSpore Transformers承载在模型应用层MindIE LLM中,通过MindIE Service可以部署MindSpore Transformers中的大模型。 - -MindIE推理的模型支持度可参考[模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/start/models.html)。 - -## 环境搭建 - -### 软件安装 - -1. 安装MindSpore Transformers - - 参考[MindSpore Transformers官方安装指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/quick_start/install.html)进行安装。 - -2. 
安装MindIE - - 参考[MindIE安装依赖文档](https://www.hiascend.com/document/detail/zh/mindie/100/envdeployment/instg/mindie_instg_0010.html)完成依赖安装。之后前往[MindIE资源下载中心](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann)下载软件包进行安装。 - - MindIE与CANN版本必须配套使用,其版本配套关系如下所示。 - - | MindIE | CANN-toolkit | CANN-kernels | - |:-------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------:| - | [1.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | [8.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | [8.0.0](https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann) | - -### 环境变量 - -若安装路径为默认路径,可以运行以下命令初始化各组件环境变量。 - -```bash -# Ascend -source /usr/local/Ascend/ascend-toolkit/set_env.sh -# MindIE -source /usr/local/Ascend/mindie/latest/mindie-llm/set_env.sh -source /usr/local/Ascend/mindie/latest/mindie-service/set_env.sh -# MindSpore -export LCAL_IF_PORT=8129 -# 组网配置 -export MS_SCHED_HOST=127.0.0.1 # scheduler节点ip地址 -export MS_SCHED_PORT=8090 # scheduler节点服务端口 -``` - -> 若机器上有其他卡已启动MindIE,需要注意`MS_SCHED_PORT`参数是否冲突。日志打印中该参数报错的话,替换为其他端口号重新尝试即可。 - -## 推理服务部署基本流程 - -### 准备模型文件 - -创建一个文件夹,用于存放MindIE后端的指定模型相关文件,如模型tokenizer文件、yaml配置文件和config文件等。 - -```bash -mkdir -p mf_model/qwen1_5_72b -``` - -以Qwen1.5-72B为例,文件夹目录结构如下: - -```reStructuredText -mf_model - └── qwen1_5_72b - ├── config.json # 模型json配置文件,Hugging Face上对应模型下载 - ├── vocab.json # 模型vocab文件,Hugging Face上对应模型下载 - ├── merges.txt # 模型merges文件,Hugging Face上对应模型下载 - ├── predict_qwen1_5_72b.yaml # 模型yaml配置文件 - ├── qwen1_5_tokenizer.py # 模型tokenizer文件,从mindformers仓中research目录下找到对应模型复制 - └── qwen1_5_72b_ckpt_dir # 模型分布式权重文件夹 -``` - -predict_qwen1_5_72b.yaml需要关注以下配置: - -```yaml -load_checkpoint: '/mf_model/qwen1_5_72b/qwen1_5_72b_ckpt_dir' # 为存放模型分布式权重文件夹路径 -use_parallel: True -auto_trans_ckpt: False # 是否开启自动权重转换,离线切分设置为False -parallel_config: - data_parallel: 1 - model_parallel: 4 # 多卡推理配置模型切分,一般与使用卡数一致 - pipeline_parallel: 1 -processor: - tokenizer: - vocab_file: "/path/to/mf_model/qwen1_5_72b/vocab.json" # vocab文件绝对路径 - merges_file: "/path/to/mf_model/qwen1_5_72b/merges.txt" # merges文件绝对路径 -``` - -模型权重下载和转换可参考 [权重格式转换指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/weight_conversion.html)。 - -不同模型的所需文件和配置可能会有差异,详情参考[模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/start/models.html)中具体模型的推理章节。 - -### 启动MindIE - -#### 1. 一键启动(推荐) - -mindformers仓上提供一键拉起MindIE脚本,脚本中已预置环境变量设置和服务化配置,仅需输入模型文件目录后即可快速拉起服务。 - -进入`scripts`目录下,执行MindIE启动脚本: - -```shell -cd ./scripts -bash run_mindie.sh --model-name xxx --model-path /path/to/model - -# 参数说明 ---model-name: 必传,设置MindIE后端名称 ---model-path:必传,设置模型文件夹路径,如/path/to/mf_model/qwen1_5_72b ---help : 脚本使用说明 -``` - -查看日志: - -```bash -tail -f output.log -``` - -当log日志中出现`Daemon start success!`,表示服务启动成功。 - -#### 2. 
自定义启动 - -MindIE安装路径均为默认路径`/usr/local/Ascend/.` 如自定义安装路径,同步修改以下例子中的路径。 - -打开mindie-service目录中的config.json,修改server相关配置。 - -```bash -vim /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json -``` - -其中`modelWeightPath`和`backendType`必须修改配置为: - -```bash -"modelWeightPath": "/path/to/mf_model/qwen1_5_72b" -"backendType": "ms" -``` - -`modelWeightPath`为上一步创建出的模型文件夹,放置模型和tokenizer等相关文件;`backendType`后端启动方式必须为`ms`。 - -其他相关参数如下: - -| 可选配置项 | 取值类型 | 取值范围 | 配置说明 | -| ------------------- | -------- | -------------------- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| httpsEnabled | Bool | True/False | 是否开启HTTPS通信安全认证,默认为True。便于启动,建议设置为False。 | -| maxSeqLen | int32 | 按用户需求自定义,>0 | 最大序列长度。输入的长度+输出的长度<=maxSeqLen,用户根据自己的推理场景选择maxSeqLen。 | -| npuDeviceIds | list | 按模型需求自定义 | 此配置项暂不生效。实际运行的卡由可见卡环境变量和worldSize配置控制。可见卡需调整资源参考[CANN环境变量](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/envref/envref_07_0029.html)。 | -| worldSize | int32 | 按模型需求自定义 | 可见卡的使用卡数。例:ASCEND_RT_VISIBLE_DEVICES=4,0,1,2且worldSize=2,则取第4,0卡运行。 | -| npuMemSize | int32 | 按显存自定义 | NPU中可以用来申请KVCache的size上限(GB),可按部署模型的实际大小计算得出:npuMemSize=(总空闲-权重/mp数量)*系数,其中系数取0.8。建议值:8。 | -| cpuMemSize | int32 | 按内存自定义 | CPU中可以用来申请KVCache的size上限(GB),和swap功能有关,cpuMemSize不足时会将Cache释放进行重计算。建议值:5。 | -| maxPrefillBatchSize | int32 | [1, maxBatchSize] | 最大prefill batch size。maxPrefillBatchSize和maxPrefillTokens谁先达到各自的取值就完成本次组batch。该参数主要是在明确需要限制prefill阶段batch size的场景下使用,否则可以设置为0(此时引擎将默认取maxBatchSize值)或与maxBatchSize值相同。必填,默认值:50。 | -| maxPrefillTokens | int32 | [5120, 409600] | 每次prefill时,当前batch中所有input token总数,不能超过maxPrefillTokens。maxPrefillTokens和maxPrefillBatchSize谁先达到各自的取值就完成本次组batch。必填,默认值:8192。 | -| maxBatchSize | int32 | [1, 5000] | 最大decode batch size,根据模型规模和NPU显存估算得出。 | -| maxIterTimes | int32 | [1, maxSeqLen-1] | 可以进行的decode次数,即一句话最大可生成长度。请求级别里面有一个max_output_length参数,maxIterTimes是一个全局设置,与max_output_length取小作为最终output的最长length。 | - -全量配置参数可查阅 [MindIE Service开发指南-快速开始-配置参数说明](https://www.hiascend.com/document/detail/zh/mindie/10RC3/mindieservice/servicedev/mindie_service0285.html)。 - -运行启动脚本: - -```bash -cd /path/to/mindie/latest/mindie-service -nohup ./bin/mindieservice_daemon > output.log 2>&1 & -tail -f output.log -``` - -当log日志中出现`Daemon start success!`,表示服务启动成功。 - -Python相关日志: - -```bash -export MINDIE_LLM_PYTHON_LOG_TO_FILE=1 -export MINDIE_LLM_PYTHON_LOG_PATH=/usr/local/Ascend/mindie/latest/mindie-service/logs/pythonlog.log -tail -f /usr/local/Ascend/mindie/latest/mindie-service/logs/pythonlog.log -``` - -## MindIE服务化部署及推理示例 - -以下例子各组件安装路径均为默认路径`/usr/local/Ascend/.` , 模型使用`Qwen1.5-72B`。 - -### 准备模型文件 - -以Qwen1.5-72B为例,准备模型文件目录。目录结构及配置详情可参考[准备模型文件](#准备模型文件): - -```bash -mkdir -p mf_model/qwen1_5_72b -``` - -### 启动MindIE - -#### 1. 一键启动(推荐) - -进入`scripts`目录下,执行mindie启动脚本: - -```shell -cd ./scripts -bash run_mindie.sh --model-name qwen1_5_72b --model-path /path/to/mf_model/qwen1_5_72b -``` - -查看日志: - -```bash -tail -f output.log -``` - -当log日志中出现`Daemon start success!`,表示服务启动成功。 - -#### 2. 
自定义启动 - -打开mindie-service目录中的config.json,修改server相关配置。 - -```bash -vim /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json -``` - -修改完后的config.json如下: - -```json -{ - "Version" : "1.0.0", - "LogConfig" : - { - "logLevel" : "Info", - "logFileSize" : 20, - "logFileNum" : 20, - "logPath" : "logs/mindservice.log" - }, - - "ServerConfig" : - { - "ipAddress" : "127.0.0.1", - "managementIpAddress" : "127.0.0.2", - "port" : 1025, - "managementPort" : 1026, - "metricsPort" : 1027, - "allowAllZeroIpListening" : false, - "maxLinkNum" : 1000, - "httpsEnabled" : false, - "fullTextEnabled" : false, - "tlsCaPath" : "security/ca/", - "tlsCaFile" : ["ca.pem"], - "tlsCert" : "security/certs/server.pem", - "tlsPk" : "security/keys/server.key.pem", - "tlsPkPwd" : "security/pass/key_pwd.txt", - "tlsCrl" : "security/certs/server_crl.pem", - "managementTlsCaFile" : ["management_ca.pem"], - "managementTlsCert" : "security/certs/management/server.pem", - "managementTlsPk" : "security/keys/management/server.key.pem", - "managementTlsPkPwd" : "security/pass/management/key_pwd.txt", - "managementTlsCrl" : "security/certs/management/server_crl.pem", - "kmcKsfMaster" : "tools/pmt/master/ksfa", - "kmcKsfStandby" : "tools/pmt/standby/ksfb", - "inferMode" : "standard", - "interCommTLSEnabled" : false, - "interCommPort" : 1121, - "interCommTlsCaFile" : "security/grpc/ca/ca.pem", - "interCommTlsCert" : "security/grpc/certs/server.pem", - "interCommPk" : "security/grpc/keys/server.key.pem", - "interCommPkPwd" : "security/grpc/pass/key_pwd.txt", - "interCommTlsCrl" : "security/certs/server_crl.pem", - "openAiSupport" : "vllm" - }, - - "BackendConfig" : { - "backendName" : "mindieservice_llm_engine", - "modelInstanceNumber" : 1, - "npuDeviceIds" : [[0,1,2,3]], - "tokenizerProcessNumber" : 8, - "multiNodesInferEnabled" : false, - "multiNodesInferPort" : 1120, - "interNodeTLSEnabled" : true, - "interNodeTlsCaFile" : "security/grpc/ca/ca.pem", - "interNodeTlsCert" : "security/grpc/certs/server.pem", - "interNodeTlsPk" : "security/grpc/keys/server.key.pem", - "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt", - "interNodeTlsCrl" : "security/grpc/certs/server_crl.pem", - "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa", - "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb", - "ModelDeployConfig" : - { - "maxSeqLen" : 8192, - "maxInputTokenLen" : 8192, - "truncation" : false, - "ModelConfig" : [ - { - "modelInstanceType" : "Standard", - "modelName" : "Qwen1.5-72B-Chat", - "modelWeightPath" : "/mf_model/qwen1_5_72b", - "worldSize" : 4, - "cpuMemSize" : 15, - "npuMemSize" : 15, - "backendType" : "ms" - } - ] - }, - - "ScheduleConfig" : - { - "templateType" : "Standard", - "templateName" : "Standard_LLM", - "cacheBlockSize" : 128, - - "maxPrefillBatchSize" : 50, - "maxPrefillTokens" : 8192, - "prefillTimeMsPerReq" : 150, - "prefillPolicyType" : 0, - - "decodeTimeMsPerReq" : 50, - "decodePolicyType" : 0, - - "maxBatchSize" : 200, - "maxIterTimes" : 4096, - "maxPreemptCount" : 0, - "supportSelectBatch" : false, - "maxQueueDelayMicroseconds" : 5000 - } - } -} -``` - -> 为便于测试,`httpsEnabled`参数设置为`false`,忽略后续https通信相关参数。 - -进入mindie-service目录启动服务: - -```bash -cd /usr/local/Ascend/mindie/1.0.RC3/mindie-service -nohup ./bin/mindieservice_daemon > output.log 2>&1 & -tail -f output.log -``` - -打印如下信息,启动成功。 - -```bash -Daemon start success! 
-``` - -### 请求测试 - -服务启动成功后,可使用curl命令发送请求验证,样例如下: - -```bash -curl -w "\ntime_total=%{time_total}\n" -H "Accept: application/json" -H "Content-type: application/json" -X POST -d '{"inputs": "I love Beijing, because","stream": false}' http://127.0.0.1:1025/generate -``` - -返回推理结果验证成功: - -```json -{"generated_text":" it is a city with a long history and rich culture....."} -``` - -## 模型列表 - -其他模型的MindIE推理示例可参考[模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/start/models.html)中的各模型的介绍文档。 \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/usage/multi_modal.md b/docs/mindformers/docs/source_zh_cn/usage/multi_modal.md deleted file mode 100644 index 1607582a8f954e26ed3c917681797c0312e152eb..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/usage/multi_modal.md +++ /dev/null @@ -1,331 +0,0 @@ -# 多模态理解模型开发 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/usage/multi_modal.md) - -多模态理解模型(Multimodal Model)是指能够处理并结合来自不同模态(如文字、图像、音频、视频等)的信息进行学习和推理的人工智能模型。 -传统的单一模态模型通常只关注单一数据类型,如文本分类模型只处理文本数据,图像识别模型只处理图像数据。而多模态理解模型则通过融合不同来源的数据来完成更复杂的任务,从而能够理解和生成更加丰富、全面的内容。 - -本文档旨在介绍MindSpore Transformers中的多模态理解模型,文档提供详细的步骤和示例指导用户使用MindSpore Transformers构建自定义的多模态理解模型和数据处理等模块。此外,用户还可以根据文档内容,完成模型的训练和推理等任务。 - -MindSpore Transformers中多模态理解模型统一架构主要包括如下几个部分的内容: - -- [数据集构建](#数据集构建) -- [数据处理模块](#数据处理模块) -- [模型构建](#模型构建) - - [模型配置类](#模型配置类) - - [非文本模态处理模块](#非文本模态处理模块) - - [跨模态交互模块](#跨模态交互模块) - - [文本生成模块](#文本生成模块) -- [多模态理解模型实践](#多模态理解模型实践) - -## 数据集构建 - -在训练多模态理解模型之前,通常需要先完成多模态数据集的构建,MindSpore Transformers目前提供多模态数据的`dataset`类和`dataloader`类,用户可直接使用: - -- [BaseMultiModalDataLoader](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/dataset/dataloader/multi_modal_dataloader.py)是多模态数据集加载类,主要完成从`json`文件中读取数据的功能; -- [ModalToTextSFTDataset](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/dataset/modal_to_text_sft_dataset.py)是多模态数据集处理类,主要完成多模态数据处理,以及数据集批处理、数据集重复等操作,具体多模态数据处理可参考[数据处理模块](#数据处理模块); - -以下是`Cogvlm2-Video`模型的训练数据集`json`文件部分内容示例: - -```json -[{ - "id": "v_p1QGn0IzfW0.mp4", - "conversations": [ - { - "from": "user", - "value": "<|reserved_special_token_3|>/path/VideoChatGPT/convert/v_p1QGn0IzfW0.mp4<|reserved_special_token_4|>What equipment is visible in the gym where the boy is doing his routine?" - }, - { - "from": "assistant", - "value": "There is other equipment visible in the gym like a high bar and still rings." 
- } - ] -}] -``` - -其中,`<|reserved_special_token_3|>`和`<|reserved_special_token_3|>`是`Cogvlm2-Video`模型中视频路径的标识符。 - -用户可根据需要构造自定义的`json`文件,文件格式为一个包含多个字典的列表,每个字典代表一个数据样本,样本中`id`字段表示数据标识符,`conversations`字段表示多轮对话内容。 - -在构造`json`文件之后,可运行下面的示例代码查看数据集中的数据样本: - -```python -from mindformers.dataset.dataloader.multi_modal_dataloader import BaseMultiModalDataLoader - -# build data loader -dataset_loader = BaseMultiModalDataLoader( - annotation_file = '/path/dataset.json', shuffle=False -) -print(dataset_loader[0]) - -# ([['user', '<|reserved_special_token_3|>/path/VideoChatGPT/convert/v_p1QGn0IzfW0.mp4<|reserved_special_token_4|>What equipment is visible in the gym where the boy is doing his routine?'], ['assistant', 'There is other equipment visible in the gym like a high bar and still rings.']],) -``` - -## 数据处理模块 - -在多模态理解模型的训练和推理过程中,都需要使用数据处理模块实现对多模态数据的预处理,该模块在训练时会在ModalToTextSFTDataset中被调用,推理时则是在[MultiModalToTextPipeline](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/pipeline/mindformers.pipeline.MultiModalToTextPipeline.html#mindformers.pipeline.MultiModalToTextPipeline)中被调用。 - -下图是多模态数据的处理流程图,图中的自定义模块需要用户根据实际需求实现,其他模块直接调用即可。 - -![multi_modal.png](image/multi_modal.png) - -下面以[CogVLm2-Video模型数据预处理模块](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/cogvlm2/cogvlm2_processor.py)为例,介绍多模态数据处理模块中各组成部分的功能。 - -1. BaseXModalToTextProcessor主要用于接收用于推理的多模态原始数据并对进行预处理操作,同时也实现了推理结果后处理操作,该类用户可直接使用; -2. BaseXModalToTextTransform主要用于将`BaseXModalToTextProcessor`或多模态数据集返回的数据分别处理为推理或训练数据,该类用户可直接使用; -3. [ModalContentTransformTemplate](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.multi_modal.ModalContentTransformTemplate.html#mindformers.models.multi_modal.ModalContentTransformTemplate)是所有模态训推数据构建模块的抽象类,由于数据具体操作与模型相关,因此用户需要根据需求实现对应的自定义数据构建类,在`Cogvlm2-Video`模型中实现了`CogVLM2ContentTransformTemplate`类,实现了对视频以及文本数据的处理; -4. 
ModalContentBuilder是所有单模态数据处理的抽象类,如果模型要处理多个模态的数据,就需要在自定义数据构建类初始化时创建多个对应的单模态数据处理类,在`Cogvlm2-Video`模型中实现了`CogVLM2VideoContentBuilder`类用于处理视频数据,并使用通用文本数据处理类`BaseTextContentBuilder`类处理文本数据。 - -下面是`Cogvlm2-Video`模型训练、推理数据预处理的示例代码。 - -### 模型训练数据处理 - -在多模态理解模型训练任务中,数据预处理的配置通常会写在`train_dataset`中,`Cogvlm2-Video`模型训练配置文件中数据集相关配置如下: - -[finetune_cogvlm2_video_llama3_chat_13b_lora.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/finetune_cogvlm2_video_llama3_chat_13b_lora.yaml) - -```yaml -train_dataset: &train_dataset - data_loader: - type: BaseMultiModalDataLoader - annotation_file: "/path/train_data.json" - shuffle: True - modal_to_text_transform: - type: BaseXModalToTextTransform - max_length: 2048 - model_transform_template: - type: CogVLM2ContentTransformTemplate - output_columns: [ "input_ids", "images", "video_context_pos", "position_ids", "labels" ] - signal_type: "chat" - mode: 'train' - pos_pad_length: 2048 - tokenizer: - add_bos_token: False - add_eos_token: False - max_length: 2048 - pad_token: "<|reserved_special_token_0|>" - vocab_file: "/path/tokenizer.model" - type: CogVLM2Tokenizer -``` - -其中,`annotation_file`为训练数据的`json`文件路径,`modal_to_text_transform`与`tokenizer`都应该与推理配置中`processor`中的类似。 - -```python -from mindformers.tools.register.config import MindFormerConfig -from mindformers.dataset.modal_to_text_sft_dataset import ModalToTextSFTDataset - -# load configs -configs = MindFormerConfig("configs/cogvlm2/finetune_cogvlm2_video_llama3_chat_13b_lora.yaml") -# build dataset -multi_modal_dataset = ModalToTextSFTDataset(**configs.train_dataset) -# iterate dataset -for item in multi_modal_dataset: - print(len(item)) - break -# 5, output 5 columns -``` - -### 模型推理数据处理 - -`Cogvlm2-Video`模型推理配置文件中数据处理模块的配置如下: - -[predict_cogvlm2_video_llama3_chat_13b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/predict_cogvlm2_video_llama3_chat_13b.yaml) - -```yaml -processor: - type: BaseXModalToTextProcessor - model_transform_template: - type: CogVLM2ContentTransformTemplate - output_columns: [ "input_ids", "position_ids", "images", "video_context_pos" ] - vstack_columns: [ "images", "video_context_pos" ] - signal_type: "chat" - pos_pad_length: 2048 - tokenizer: - add_bos_token: False - add_eos_token: False - max_length: 2048 - pad_token: "<|reserved_special_token_0|>" - vocab_file: "/path/tokenizer.model" - type: CogVLM2Tokenizer -``` - -其中,`vocab_file`为实际使用词表文件路径,其他参数为模型相关配置,用户可按需进行自定义配置。 - -下面是多模态数训练据处理示例代码,与训练数据不同的是,通过数据处理可以得到一个包含`input_ids`等处理后的数据的字典,而不是一个列表。 - -```python -from mindformers.tools.register.config import MindFormerConfig -from mindformers.models.multi_modal.base_multi_modal_processor import BaseXModalToTextProcessor -from mindformers.models.cogvlm2.cogvlm2_tokenizer import CogVLM2Tokenizer - -# build processor -configs = MindFormerConfig("configs/cogvlm2/predict_cogvlm2_video_llama3_chat_13b.yaml") -configs.processor.tokenizer = tokenizer = CogVLM2Tokenizer(**configs.processor.tokenizer) -processor = BaseXModalToTextProcessor(**configs.processor) - -# process data -multi_modal_data = [ - {'video': "/path/video.mp4"}, - {'text': "Please describe this video."} -] - -print(processor(multi_modal_data).keys()) -# dict_keys(['input_ids', 'position_ids', 'images', 'video_context_pos']) -``` - -在实现多模态数据集构建以及数据处理模块之后,就可以得到多模态理解模型可以处理的数据,下面将介绍如何构建多模态大模型。 - -## 模型构建 - -多模态大模型通常包括非文本模态处理模块、跨模态交互模块以及文本生成模块三个部分,其中非文本模态处理模块通常为经过大规模数据预训练后的视觉模型, -文本生成模块通常为文本生成大模型,跨模态交互模块通常由多个线性层组成。 - -### 模型配置类 - -MindSpore 
Transformers中多模态理解模型相关参数主要通过模型配置类进行控制,下面以`CogVLM2Config`类为例介绍如何构建模型配置类, -具体实现可参考[CogVLM2Config](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/cogvlm2/cogvlm2_config.py)。 - -```python -@MindFormerRegister.register(MindFormerModuleType.CONFIG) -class CogVLM2Config(PretrainedConfig): - def __init__(self, - vision_model: PretrainedConfig, - llm_model: PretrainedConfig, - **kwargs): - super().__init__(**kwargs) - self.vision_model = vision_model - self.llm_model = llm_model -``` - -参数说明: - -1. `@MindFormerRegister.register(MindFormerModuleType.CONFIG)`主要用于注册自定义的模型配置类,注册后的模型配置类可在`yaml`文件中通过名称进行调用; -2. `vision_model`和`llm_model`分别表示视觉模型以及文本生成模型的配置类,作为多模态理解模型配置类的入参,并在类初始化过程中对其进行处理; -3. `PretrainedConfig`是所有模型配置的基类,具体可参考[PretrainedConfig](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.PretrainedConfig.html#mindformers.models.PretrainedConfig)。 - -在配置文件中,按如下结构对模型进行配置, -具体实现可参考[predict_cogvlm2_video_llama3_chat_13b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/predict_cogvlm2_video_llama3_chat_13b.yaml)。 - -```yaml -model: - model_config: - type: MultiModalConfig - vision_model: - arch: - type: EVAModel - model_config: - type: EVA02Config - image_size: 224 - patch_size: 14 - hidden_size: 1792 - num_hidden_layers: 63 - ... - llm_model: - arch: - type: CogVLM2VideoLM - model_config: - type: LlamaConfig - seq_length: 2048 - hidden_size: 4096 - num_layers: 32 - ... - arch: - type: CogVLM2ForCausalLM -``` - -在该配置文件中,将`EVAModel`、`EVA02Config`作为`vision_model`模型及其配置类,将`CogVLM2VideoLM`、`LlamaConfig`作为`llm_model`模型及其配置类, -由此构成多模态理解模型`CogVLM2ForCausalLM`,这些类都是MindSpore Transformers已实现的模块,下面将介绍如何实现自定义模块。 - -### 非文本模态处理模块 - -MindSpore Transformers提供`ViT`、`EVA02`等模型作为视觉信息处理模块,下面以`EVA02`模型为例介绍如何构建非文本模态处理模块, -具体可参考[EVAModel](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/eva02/eva.py)和[EVA02Config](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/models/eva02/eva_config.py)。 - -```python -from mindformers.tools.register import MindFormerRegister, MindFormerModuleType -from mindformers.models.modeling_utils import PreTrainedModel -from mindformers.models.eva02.eva_config import EVA02Config - -class EVA02PreTrainedModel(PreTrainedModel): - config_class = EVA02Config - base_model_prefix = "eva02" - -@MindFormerRegister.register(MindFormerModuleType.MODELS) -class EVAModel(EVA02PreTrainedModel): - def __init__(self, config=None): - config = config if config else EVA02Config() - super().__init__(config) -``` - -参数说明: - -1. `@MindFormerRegister.register(MindFormerModuleType.MODELS)`主要用于注册自定义的模型类,注册后的模型类可在`yaml`文件中通过名称进行调用; -2. `EVA02PreTrainedModel`继承自`PreTrainedModel`类,主要用于指定模型配置类以及模型参数名的前缀,`EVAModel`作为模型的具体实现,承自`EVA02PreTrainedModel`类,相关API说明可参考[PreTrainedModel](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/models/mindformers.models.PreTrainedModel.html#mindformers.models.PreTrainedModel); -3. 
`EVAModel`主要对数据中的视觉信息进行处理,将处理后的视觉特征输入**跨模态交互模块**。 - -### 跨模态交互模块 - -文本生成模块通常为经过预训练的大语言模型,而非文本模态处理模块为经过大规模非文本数据预训练后的模型,其输出特征和与文本特征中所包含的信息差异过大,无法直接输入到文本生成模块中进行推理,因此需要构造与文本生成模块相匹配的跨模态交互模块,将视觉特征处理为文本生成模块可处理的向量。 - -下面以`CogVLM2-Video`模型中的`VisionMLPAdapter`为例,介绍跨模态交互模块的结构与功能。 - -```python -class VisionMLPAdapter(nn.Cell): - def __init__(self, vision_grid_size, vision_hidden_size, text_hidden_size, text_intermediate_size, - compute_dtype=ms.float16, param_init_type=ms.float16): - super().__init__() - self.grid_size = vision_grid_size - self.linear_proj = GLU(in_features=vision_hidden_size, - hidden_size=text_hidden_size, - intermediate_size=text_intermediate_size, - compute_dtype=compute_dtype, param_init_type=param_init_type) - self.conv = nn.Conv2d(in_channels=vision_hidden_size, out_channels=vision_hidden_size, - kernel_size=2, stride=2, dtype=param_init_type, has_bias=True).to_float(compute_dtype) -``` - -在`VisionMLPAdapter`中会将`EVAModel`的输出通过Linear、Conv2D等操作处理成与文本特征相同的维度,其中`vision_hidden_size`和`text_hidden_size`分别表示视觉和文本特征维度。 - -### 文本生成模块 - -MindSpore Transformers提供`Llama2`、`Llama3`等语言大模型作为文本生成模块,与非文本模态处理模块、跨模态交互模块共同构成多模态理解模型。 - -```python -@MindFormerRegister.register(MindFormerModuleType.MODELS) -class MultiModalForCausalLM(BaseXModalToTextModel): - def __init__(self, config: MultiModalConfig, **kwargs): - super().__init__(config, **kwargs) - self.config = config - self.vision_model = build_network(config.vision_model) - self.llm_model = build_network(config.llm_model) - self.mlp_adapter = VisionMLPAdapter(**kwargs) - - def prepare_inputs_for_generation(self, input_ids, **kwargs): - """Prepare inputs for generation in inference.""" - - def prepare_inputs_for_predict_layout(self, input_ids, **kwargs): - """Prepare inputs for generation in inference.""" - - def set_dynamic_inputs(self, **kwargs): - """Set dynamic inputs for model.""" - - def construct(self, input_ids, **kwargs): - """Model forward.""" -``` - -参数说明: - -1. `MultiModalForCausalLM`作为多模态理解模型类,继承自基类`BaseXModalToTextModel`,在该类构建过程中通过`build_network`和对应模块的配置,对非文本模态处理模块`vision_model`、文本生成模块`llm_model`以及跨模态交互模块`VisionMLPAdapter`进行初始化; -2. `prepare_inputs_for_generation`方法可以对输入数据进行预处理,要求处理后的数据可通过`construct`方法实现模型推理; -3. `prepare_inputs_for_predict_layout`方法用于构造模型可处理的数据,其返回值与`construct`方法入参对应,通过构造后的数据可实现模型编译; -4. `set_dynamic_inputs`方法可以为模型入参中的部分数据配置动态shape; -5. 
`construct`方法为所有模型通用接口,也是模型前向执行函数。 - -## 多模态理解模型实践 - -在实现多模态数据集、数据处理模块以及多模态理解模型构建之后,就可以通过模型配置文件启动模型预训练、微调、推理等任务,为此需要构建对应的模型配置文件。 - -具体模型配置文件可参考[predict_cogvlm2_video_llama3_chat_13b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/predict_cogvlm2_video_llama3_chat_13b.yaml)和[finetune_cogvlm2_video_llama3_chat_13b_lora.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/cogvlm2/finetune_cogvlm2_video_llama3_chat_13b_lora.yaml)分别对应模型推理和微调,其中参数具体含义可查阅[配置文件说明](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/appendix/conf_files.html)。 - -在用户自定义的配置文件中`model`、`processor`、`train_dataset`等部分内容需要对应用户自定义的**数据集**、**数据处理模块**以及**多模态理解模型**进行设置。 - -编辑自定义的配置文件之后,参考[CogVLM2-Video模型文档](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md)启动模型[推理](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md#推理)和[微调](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/cogvlm2_video.md#微调)任务即可。 diff --git a/docs/mindformers/docs/source_zh_cn/usage/pre_training.md b/docs/mindformers/docs/source_zh_cn/usage/pre_training.md deleted file mode 100644 index a4c4337a354f8692a04bee80d4d525b32719ce01..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/usage/pre_training.md +++ /dev/null @@ -1,89 +0,0 @@ -# 预训练 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/usage/pre_training.md) - -## 概述 - -预训练是指在大规模无标注数据上训练模型,使其能够全面捕捉语言的广泛特性。通过预训练,模型可以学习到词汇、句法和语义等层面的知识,这些知识在下游任务中通过微调得到应用,从而优化特定任务的性能。MindSpore Transformers框架的预训练目标是帮助开发者快速、便捷地构建和训练基于Transformer架构的预训练模型。 - -## 预训练的基本操作流程 - -结合实际操作,预训练的基本流程可以分解为以下步骤: - -1. **数据集准备:** - 预训练需要在一个大规模、未标注的文本数据集上进行,这些数据集通常包含来自网络、书籍、文章等多种来源的大量文本。数据集的多样性和规模对模型的泛化能力有很大影响。 - -2. **选择模型架构:** - 根据任务需求和计算资源,选择合适的模型架构来构建预训练模型。 - -3. **执行预训练:** - 在准备好的大规模数据集上执行预训练,使用配置好的模型架构和训练配置进行长时间的训练,生成最终的预训练模型权重。 - -4. 
**保存模型:** - 训练完成后,将模型权重保存到指定位置。 - -## 基于MindSpore Transformers的预训练实践 - -MindSpore Transformers目前已经支持业界主流大模型,该实践流程选择以Llama2-7B和Llama3-70B分别展示[单机训练](#单机训练)和[多机训练](#多机训练)。 - -### 数据集准备 - -| 数据集名称 | 适用模型 | 适用阶段 | 下载链接 | -|:--------|:----------:|:--------:|:-------------------------------------------------------------------------------:| -| Wikitext2 | Llama2-7B | Pretrain | [Link](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/dataset/wikitext-2/wikitext-2-v1.zip) | -| Wiki103 | Llama3-70B | Pretrain | [Link](https://dagshub.com/DagsHub/WIkiText-103/src/main/dataset/tokens) | - -### 数据预处理 - -其中Llama2-7B的数据集处理可参考[Wikitext2数据预处理](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#数据及权重准备),Llama3-70B的数据集处理可参考[Wiki103数据预处理](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/README.md#数据集及权重准备)。 - -## 执行预训练任务 - -### 单机训练 - -以Llama2-7B为例,通过指定配置文件[pretrain_llama2_7b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/pretrain_llama2_7b.yaml)以msrun的方式启动[run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/run_mindformer.py)脚本,进行8卡分布式训练,启动命令如下: - -```bash -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/pretrain_llama2_7b.yaml \ - --train_dataset_dir /{path}/wiki4096.mindrecord \ - --use_parallel True \ - --run_mode train" 8 - - # 参数说明: - config: 模型的配置文件,文件在MindSpore Transformers代码仓中config目录下 - train_dataset_dir: 训练数据集路径 - use_parallel: 是否开启并行 - run_mode: 运行模式,train:训练,finetune:微调,predict:推理 - ``` - -任务执行完成后,在mindformers/output目录下,会生成checkpoint文件夹,同时模型文件会保存在该文件夹下。 - -### 多机训练 - -以Llama3-70B为例,使用[pretrain_llama3_70b.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/llama3/llama3_70b/pretrain_llama3_70b.yaml)配置文件,以msrun方式运行[run_mindformer.py](https://gitee.com/mindspore/mindformers/blob/r1.5.0/run_mindformer.py)执行8机64卡预训练。多机多卡执行脚本进行分布式训练需要分别在不同节点运行脚本,并将参数**MASTER_ADDR**设置为主节点的ip地址,所有节点设置的ip地址相同,不同节点之间仅参数**NODE_RANK**不同,各个参数位置含义参见[msrun启动使用指南](https://www.mindspore.cn/tutorials/zh-CN/r2.6.0/parallel/msrun_launcher.html)。 - -```shell -# 节点0,设0节点ip为MASTER_ADDR,作为主节点ip,总共64卡且每个节点8卡 -# 节点0、节点1、...节点7 依此修改node_num,比如8机,node_num为0~7。 -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --register_path research/llama3 \ - --config research/llama3/llama3_70b/pretrain_llama3_70b.yaml \ - --train_dataset dataset_dir \ - --use_parallel True \ - --run_mode train" \ - 64 8 {MASTER_ADDR} 8118 {node_num} output/msrun_log False 300 - - # 参数说明: - register_path: 模型API的注册路径,是一个包含模型Python文件的目录路径(可以是research目录下模型文件夹的路径) - config: 模型的配置文件,文件在MindSpore Transformers代码仓中config目录下 - train_dataset_dir: 训练数据集路径 - use_parallel: 是否开启并行 - run_mode: 运行模式,train:训练,finetune:微调,predict:推理 -``` - -**注意**: 在多机分布式训练的过程中,可能会遇到一些性能问题。为了确保训练过程的高效性和稳定性,建议参考[大模型性能调优指南](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/perf_optimize/perf_optimize.html),进行必要的性能优化和调整。 - -## 更多信息 - -更多关于不同模型的训练示例,请访问[MindSpore Transformers已支持模型库](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/start/models.html)。 \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/usage/pretrain_gpt.md b/docs/mindformers/docs/source_zh_cn/usage/pretrain_gpt.md deleted file mode 100644 index 02798ca369b615a2c5831d0a02c0724db76b64eb..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/usage/pretrain_gpt.md +++ /dev/null @@ -1,505 +0,0 @@ -# 动态图并行 - 
-[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/usage/pretrain_gpt.md) - -## 概述 - -本教程演示如何使用MindSpore Transformers动态图并行框架训练GPT模型,此框架支持张量并行、流水线并行、序列并行等并行场景,还有支持使用分布式优化器动态学习率等场景,帮助开发者快速、便捷地构建和训练基于动态图并行框架的GPT预训练模型。 - -## 操作实践 - -下面基于Ascend平台,进行GPT模型训练。 - -### 样例代码参考 - -目录结构如下: - -```text -└─ gpt - ├─ pretrain_gpt.py - ├─ pretrain_gpt.sh - └─ pretrain_gpt_7B.yaml - ... -``` - -其中,`pretrain_gpt.py`是环境配置、模型对象创建及训练的脚本。`pretrain_gpt.sh`是启动执行脚本。`pretrain_gpt_7B.yaml`是配置项。 - -### 模型结构 - -GPT以`Transformer`模型为主要架构,网络结构主要围绕`Transformer`的基本构建块构建。 - -在模型中,初始化五个参数,`config`是模型配置项(在yaml文件的`model_config`中),`num_tokentypes`指定embedding的类型,`parallel_output`用来确认是否输出每一个并行Tensor的输出,`pre_process`和`post_process`分别指定是否为第一阶段和最后一阶段。 - -调用的`get_language_model`是一个基于`Transformer`模型的接口,详情请看`get_language_model`的api文档。 - -注意:数据集返回值要与模型定义的前向过程所需要的参数相对应。 - -```python -from mindformers.experimental.parallel_core.pynative.transformer.module import Module -from mindformers.experimental.parallel_core.pynative.transformer.language_model import get_language_model -from mindformers.experimental.parallel_core.pynative.transformer import ParallelLMLogits -from mindformers.experimental.parallel_core.pynative.training.loss_func import VocabParallelCrossEntropy - - -class AttnMaskType(enum.Enum): - padding = 1 - causal = 2 - no_mask = 3 - padding_causal = 4 - - -attn_mask_type_mapping = { - "padding": AttnMaskType.padding, - "causal": AttnMaskType.causal, -} - - -class GPTModel(Module): - def __init__(self, - config, - num_tokentypes=0, - parallel_output=True, - pre_process=True, - post_process=True): - super().__init__(config=config,\ - share_embeddings_and_output_weights=not config.untie_embeddings_and_output_weights) - - self.parallel_output = parallel_output - self.pre_process = pre_process - self.post_process = post_process - self.untie_embeddings_and_output_weights = config.untie_embeddings_and_output_weights - self.fp16_lm_cross_entropy = config.fp16_lm_cross_entropy - - self.set_model_key() - encoder_attn_mask_type = None - if config.encoder_attn_mask_type is not None: - encoder_attn_mask_type = attn_mask_type_mapping.get(config.encoder_attn_mask_type) - if encoder_attn_mask_type is None: - raise ValueError(f"encoder_attn_mask_type must be one of {attn_mask_type_mapping.keys()}, but got" - f"{config.encoder_attn_mask_type}") - - self.language_model, self._language_model_key = get_language_model( - config=config, - num_tokentypes=num_tokentypes, - add_pooler=False, - encoder_attn_mask_type=encoder_attn_mask_type, - pre_process=self.pre_process, - post_process=self.post_process) - - if self.post_process: - self.parallel_lm_logits = ParallelLMLogits(config=config, - bias=False, - compute_dtype=config.compute_dtype) - self.loss = VocabParallelCrossEntropy() - - if not config.untie_embeddings_and_output_weights: - self.initialize_word_embeddings() - - def set_input_tensor(self, input_tensor): - """ set input_tensor to model """ - self.language_model.set_input_tensor(input_tensor) - - def set_model_key(self): - """ set model key for differentiate PipelineCell process """ - self.model_key = "gpt3" - - def construct(self, input_ids, position_ids, attention_mask, loss_mask, - retriever_input_ids=None, - retriever_position_ids=None, - retriever_attn_mask=None, - labels=None, tokentype_ids=None, inference_params=None): - """ gpt model forward """ - # use RoPE - position_ids = None - 
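        # 说明:模型配置中 position_embedding_type 设置为 'rope',旋转位置编码在注意力计算内部注入位置信息,
        # 因此前向过程不使用绝对位置编码,这里显式将 position_ids 置为 None;
        # 下方的 retriever_* 入参仅为保持接口一致而保留,同样置为 None,不参与计算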
retriever_input_ids = None - retriever_position_ids = None - retriever_attn_mask = None - lm_output = self.language_model( - input_ids, - position_ids, - attention_mask, - retriever_input_ids=retriever_input_ids, - retriever_position_ids=retriever_position_ids, - retriever_attn_mask=retriever_attn_mask, - inference_params=inference_params) - if self.post_process: - return post_language_model_processing( - self.parallel_lm_logits, self.loss, - lm_output, labels, - self.language_model.output_layer.weight if\ - self.untie_embeddings_and_output_weights else self.shared_embedding_or_output_weight(), - self.parallel_output, - self.fp16_lm_cross_entropy, - loss_mask) - else: - return lm_output -``` - -当`post_process`为`True`时,需要对语言模型的输出`lm_output`进行后处理,输出损失和预测结果。 - -```python -import mindspore.common.dtype as mstype - -def post_language_model_processing(parallel_lm_logits, loss_fn, lm_output, labels, logit_weights, - parallel_output, fp16_lm_cross_entropy, loss_mask): - """ gpt model post process forward """ - output = parallel_lm_logits(lm_output, logit_weights, parallel_output) - - if labels is None: - return output - - labels = labels - loss_mask = loss_mask.reshape(-1) - - if fp16_lm_cross_entropy: - if output.dtype != mstype.float16: - raise ValueError(f"When fp16_lm_cross_entropy=True, output should be float16, but got {output.dtype}") - loss = loss_fn(output, labels, loss_mask) - else: - loss = loss_fn(output.astype(mstype.float32), labels) - token_nums = loss_mask.sum() - loss_mask = loss_mask.astype(mstype.float32) - loss = ops.sum(loss * loss_mask.float()) / loss_mask.sum() - return loss, output, token_nums -``` - -### 动态图并行训练配置 - -动态图并行的配置项通过yaml文件来读取,并分为不同种类,包括训练配置、并行配置、模型配置等,接下来简单介绍一下大模型训练需要的基本配置。 - -#### 配置训练参数(training_config) - -```yaml -training_config: - seed: 42 # 固定随机性用的种子 - output_dir: './output' # 输出目录,用于储存checkpoints和日志等 - training_iters: 10 # 训练迭代次数 - log_interval: 1 # 日志打印的频率 - save_interval: null # 储存checkpoints的频率 - loss_scale: 4096 # loss scale的初始值 - grad_clip_kwargs: - grad_clip_type: "ClipGlobalNorm" # 梯度裁剪的方法,可选:"ClipGlobalNorm"或者"GradClipByValue" - clip_value: 1.0 - loss_reduction: "mean" # loss reduction的方法,可选:"mean"或者"sum" - loss_func_kwargs: - loss_func_type: "VocabParallelCrossEntropy" # 损失函数,可选: "VocabParallelCrossEntropy"或者"CrossEntropyLoss" - use_distributed_optimizer: True # 是否使用分布式优化器 -``` - -#### 配置并行模式(parallel_config) - -```yaml -parallel_config: - tensor_model_parallel_size: 1 # 张量并行 - pipeline_model_parallel_size: 1 # 流水线并行 - expert_model_parallel_size: 1 # 专家并行 - virtual_pipeline_model_parallel_size: null # 虚拟流水线并行 - sequence_parallel: False # 序列并行 -``` - -#### 配置模型参数(gpt_config) - -```yaml -model_config: - params_dtype: "float32" # 参数初始化类型 - compute_dtype: "bfloat16" # 计算时使用的类型 - position_embedding_type: 'rope' # 位置编码的类型,可选:"rope"或者"absolute" - untie_embeddings_and_output_weights: True # embedding层和head层是否不共享权重 - # 配置GPT 7B模型 - num_layers: 6 # Transformer层数 - hidden_size: 4096 # 隐藏层的大小 - ffn_hidden_size: 11008 # 前馈神经网络隐藏层大小 - num_attention_heads: 32 # 注意力头的数量 -``` - -GPT模型当前有三种不同规格的配置:7B、13B和70B。 - -```yaml -7B: - num_layers: 32 - hidden_size: 4096 - ffn_hidden_size: 11008 - num_attention_heads: 32 -13B: - num_layers: 40 - hidden_size: 5120 - ffn_hidden_size: 13824 - num_attention_heads: 40 -70B: - num_layers: 80 - hidden_size: 8192 - ffn_hidden_size: 28672 - num_attention_heads: 64 - group_query_attention: True - num_query_groups: 8 -``` - -#### 数据集配置(dataset_config) - -```yaml -dataset_config: - batch_size: 1 # 一次迭代从数据集中取出的数据大小 - micro_batch_num: 2 
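  # 注:每个训练步的全局批大小一般约为 batch_size × micro_batch_num × 数据并行数(此处仅为示意,具体以框架实现为准)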
# 微批次个数 - dataset_dir: './dataset' # 数据集所在目录 - shuffle: False # 是否打乱顺序 -``` - -#### 优化器配置(optimizer_config) - -```yaml -optimizer_config: - optimizer_type: "AdamW" # 优化器类型,可选:"AdamW", "Adam", "SGD", "Came", "mint.AdamW"及"SpeedAdamW" - betas: # 优化器输入参数 - - 0.9 - - 0.95 - eps: 1.e-8 - learning_rate: 1.25e-6 # 初始学习率 - weight_decay: 1.e-1 # 权重衰减系数 - learning_rate_scheduler_kwargs: # 学习率调整策略 - warmup_steps: 200 - decay_steps: 2000 - use_cosine: True - end_learning_rate: 1.25e-7 -``` - -### 模型训练配置解析 - -在pretrain_gpt.py里对传入的yaml配置文件进行解析,可以得到训练配置、模型配置、优化器配置、并行策略配置以及数据集配置。 - -```python -import argparse -from mindformers.experimental.parallel_core.pynative.config import ( - init_configs_from_yaml -) - -def get_arg_parser(): - """get argument parser""" - parser = argparse.ArgumentParser(description="Train gpt model") - parser.add_argument("--config_path", type=str, default="pretrain_gpt.yaml", help="The path to the config file.") - parser.add_argument("--run_cmd", type=str, default="", help="running cmd.") - parser.add_argument("--model_type", type=str, default="gpt_config", help="Input model config.") - return parser -parser = get_arg_parser() -args = parser.parse_args() - -all_config = init_configs_from_yaml(args.config_path) - -training_config = all_config.training_config -model_config = all_config.model_config -optimizer_config = all_config.optimizer_config -parallel_config = all_config.parallel_config -dataset_config = all_config.dataset_config -``` - -### 通信配置 - -通过set_context接口可以指定运行模式、运行设备、运行卡号等。并行脚本还需指定并行模式`parallel_mode`为数据并行模式,并通过init根据不同的设备需求初始化HCCL、NCCL或者MCCL通信。指定平台:设置`device_target`为`Ascend`。调试阶段可以使用`set_context(pynative_synchronize=True)`开启同步模式,更准确地定位报错位置。 - -```python -import mindspore as ms - - -def set_parallel_context(parallel_config): - init() - initialize_model_parallel( - tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, - pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size=parallel_config.virtual_pipeline_model_parallel_size, - ) - logger.info( - f"dp {get_data_parallel_world_size()} | " - f"pp {parallel_config.pipeline_model_parallel_size} | " - f"tp {parallel_config.tensor_model_parallel_size} | " - f"sp {parallel_config.sequence_parallel} | " - f"vpp {parallel_config.virtual_pipeline_model_parallel_size}" - ) - - -def set_seed(seed): - # set global seed, np seed, and dataset seed - ms.set_seed(seed) - # set rng seed - ms.manual_seed(seed) - - -ms.set_context(mode=ms.PYNATIVE_MODE) -ms.set_device(device_target="Ascend") -set_parallel_context(parallel_config) -set_seed(training_config.seed) -``` - -### 创建网络对象 - -从模型库获取GPT模型,根据配置文件创建网络模型对象。通过`set_weight_decay`来为不同参数设置不同的权重衰减系数,这个函数会将参数分为两组,一组应用特定的权重衰减值,另一组权重衰减为`0`,然后返回一个包含参数分组信息的列表,赋值给`group_params`变量。调用`get_optimizer`函数,传入`optimizer_config`(优化器配置)、`training_config`(训练配置)、`group_params`(前面得到的参数分组信息)、`network_with_loss`(包含模型和损失的对象)以及一个梯度归约操作(从`training_config.loss_reduction`获取),返回一个优化器对象,并赋值给`optimizer`变量。 -创建一个`TrainOneStepCell`对象,它通常用于在训练过程中执行一步优化。传入`network_with_loss`、`optimizer`及配置作为参数,并将其赋值给train_one_step_cell变量。 - -完整的创建网络对象代码: - -```python -from mindformers.experimental.parallel_core.pynative.optimizer import get_optimizer -from mindformers.experimental.parallel_core.pynative.training import get_model -from mindformers.experimental.parallel_core.pynative.training import TrainOneStepCell -from mindformers.experimental.parallel_core.models import GPTModel - - -def decay_filter(x): - return "norm" not in x.name.lower() and "bias" not in 
x.name.lower() - - -def set_weight_decay(params, weight_decay=1e-1): - decay_params = list(filter(decay_filter, params)) - other_params = list(filter(lambda x: not decay_filter(x), params)) - group_params = [] - if decay_params: - group_params.append({"params": decay_params, "weight_decay": weight_decay}) - if other_params: - group_params.append({"params": other_params, "weight_decay": 0.0}) - return group_params - - -def model_provider_func(pre_process=True, post_process=True): - network_with_loss = GPTModel( - model_config, pre_process=pre_process, post_process=post_process - ) - return network_with_loss - -network_with_loss = get_model(model_provider_func, training_config) - -group_params = set_weight_decay(network_with_loss.trainable_params(), optimizer_config.weight_decay) -optimizer = get_optimizer( - optimizer_config, - training_config, - group_params, - network_with_loss, - grad_allreduce_op=training_config.loss_reduction -) - -train_one_step_cell = TrainOneStepCell(network_with_loss, optimizer, None, training_config, model_config) -``` - -### 加载数据集及执行训练 - -```python -from dataset import get_dataset -from mindformers.experimental.parallel_core.pynative.training import train - -train_dataset_iterator, val_dataset_iterator = get_dataset(dataset_config) -train( - train_one_step_cell, - train_dataset_iterator, - training_config, - val_dataset_iterator, - metrics, - evaluation, -) -``` - -### 运行训练脚本 - -```bash -bash pretrain_gpt.sh xx.yaml -``` - -若不指定xx.yaml,则默认为pretrain_gpt_7B.yaml。 - -训练脚本`pretrain_gpt.sh`详细解析如下: - -#### 设置环境变量 - -`HCCL_BUFFSIZE=200`设置两个NPU之间共享数据的缓存区大小为200M;`HCCL_EXEC_TIMEOUT=600`设置设备间执行时同步的等待时间为10分钟。`ASCEND_RT_VISIBLE_DEVICES`指定了可见的设备编号,这里设置为设备`0`号卡。 - -```bash -export HCCL_BUFFSIZE=200 -export HCCL_EXEC_TIMEOUT=600 -export ASCEND_RT_VISIBLE_DEVICES='0' -``` - -#### 设置端口号 - -```bash -port=8828 -``` - -如果之前的配置异常退出,可以使用如下代码进行清理。 - -```bash -PIDS=$(sudo lsof -i :$port | awk 'NR>1 {print $2}') -if [ -n "$PIDS" ]; then - for pid in $PIDS; do - kill -9 $pid - echo "Killed process $pid" - done -else - echo "No processes found listening on port $port." -fi -``` - -#### 设置日志存储路径 - -获取当前脚本所在的目录路径并存储在`project_dir`变量中,同时设置日志路径变量`log_path="msrun_log"`。先删除名为`msrun_log`的目录(如果存在),然后重新创建这个目录。 - -```bash -project_dir=$(cd "$(dirname "$0")" || exit; pwd) -log_path="msrun_log" - -rm -rf "${log_path}" -mkdir "${log_path}" -``` - -#### 设置可用设备数量 - -```bash -# 计算设备数量 -IFS=',' read -r -a devices <<< "$ASCEND_RT_VISIBLE_DEVICES" -work_num=${#devices[@]} -``` - -#### 获取配置文件 - -尝试从命令行参数中获取配置文件路径,如果没有提供命令行参数,则使用默认的配置文件 "pretrain_gpt_7B.yaml"。 - -```bash -config_path=$1 -if [ -z "$config_path" ]; then - config_path="pretrain_gpt_7B.yaml" -fi -``` - -#### 以msrun模式执行训练脚本 - -```bash -msrun --worker_num "$work_num" --local_worker_num="$work_num" --master_port=$port --log_dir="$log_path" --join=True --cluster_time_out=300 pretrain_gpt.py --config_path="${config_path}" -``` - -#### 运行结果 - -接下来通过命令调用对应的脚本。 - -```bash -bash pretrain_gpt.sh -``` - -执行完后,日志文件保存到`output`目录下,其中部分文件目录结构如下: - -```text -└─ output - └─ log - ├─ rank_0 - | ├─ info.log - | └─ error.log - ├─ rank_1 - | ├─ info.log - | └─ error.log - ... 
-``` - -关于Loss部分结果保存在`output/log/rank_*/info.log`中,示例如下: - -```text -train: Epoch:0, Step:5, Loss: 10.341485, Finite_grads: True, Loss_scale: 4096.0, Learning_rate: (1.250000e-06,1.250000e-06,), Time: 1403.24 ms -train: Epoch:0, Step:6, Loss: 10.38118, Finite_grads: True, Loss_scale: 4096.0, Learning_rate: (1.250000e-06,1.250000e-06,), Time: 1378.19 ms -train: Epoch:0, Step:7, Loss: 10.165115, Finite_grads: True, Loss_scale: 4096.0, Learning_rate: (1.250000e-06,1.250000e-06,), Time: 1370.32 ms -train: Epoch:0, Step:8, Loss: 10.039211, Finite_grads: True, Loss_scale: 4096.0, Learning_rate: (1.250000e-06,1.250000e-06,), Time: 1386.89 ms -train: Epoch:0, Step:9, Loss: 10.040031, Finite_grads: True, Loss_scale: 4096.0, Learning_rate: (1.250000e-06,1.250000e-06,), Time: 1475.95 ms -... -``` diff --git a/docs/mindformers/docs/source_zh_cn/usage/quantization.md b/docs/mindformers/docs/source_zh_cn/usage/quantization.md deleted file mode 100644 index 41f6869f9d1fc44a54ff9e59518b30a5dcdddf4c..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/usage/quantization.md +++ /dev/null @@ -1,19 +0,0 @@ -# 量化 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/usage/quantization.md) - -## 概述 - -量化(Quantization)作为一种重要的大模型压缩技术,通过对模型中的浮点参数转为低精度的整数参数,实现对参数的压缩。随着模型的参数和规格不断增大,量化在模型部署中能有效减少模型存储空间和加载时间,提高模型的推理性能。 - -MindSpore Transformers 集成 MindSpore Golden Stick 工具组件,提供统一量化推理流程,方便用户开箱即用。请参考 [MindSpore Golden Stick 安装教程](https://www.mindspore.cn/golden_stick/docs/zh-CN/r1.1.0/install.html)进行安装,并参考 [MindSpore Golden Stick 应用PTQ算法](https://www.mindspore.cn/golden_stick/docs/zh-CN/r1.1.0/ptq/ptq.html)对MindSpore Transformers中的模型进行量化。 - -## 模型支持度 - -当前仅支持以下模型,支持模型持续补充中。 - -| 支持的模型 | -|--------------------------------------------------------------------------------------------------------------------------------------| -| [DeepSeek-V3](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek3/deepseek3_671b/predict_deepseek3_671b.yaml) | -| [DeepSeek-R1](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml) | -| [Llama2](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/predict_llama2_13b_ptq.yaml) | \ No newline at end of file diff --git a/docs/mindformers/docs/source_zh_cn/usage/sft_tuning.md b/docs/mindformers/docs/source_zh_cn/usage/sft_tuning.md deleted file mode 100644 index b1cae455a149e3e93ebb79278a7ced6c06b413db..0000000000000000000000000000000000000000 --- a/docs/mindformers/docs/source_zh_cn/usage/sft_tuning.md +++ /dev/null @@ -1,256 +0,0 @@ -# SFT微调 - -[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/r2.6.0/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/r2.6.0/docs/mindformers/docs/source_zh_cn/usage/sft_tuning.md) - -## 概述 - -SFT(Supervised Fine-Tuning,监督微调)采用有监督学习思想,是指在预训练模型的基础上,通过调整部分或全部参数,使其更适应特定任务或数据集的过程。 - -## SFT微调的基本流程 - -SFT微调整体包含以下几个部分: - -- **预训练:** - 首先需要在一个较大规模的数据集上训练一个神经网络模型,比如针对大语言模型,通常是在大量未标记的文本数据上进行,预训练阶段的目标是使模型获取通用的知识和理解能力。 -- **微调:** - 结合目标任务,用新的训练数据集对已经得到的预训练模型进行微调。在微调过程中,通过反向传播可以对原始模型的全部参数或者部分参数进行优化,使模型在目标任务上取得更好的效果。 -- **评估:** - 经过微调之后会得到一个新的模型,可以用目标任务的评测数据集对微调模型进行评估,从而得到微调模型在目标任务上的性能指标。 - -结合实际操作,可以将SFT微调分解为以下步骤: - -1. **选择预训练模型:** - 选择一个预训练的语言模型,如GPT-2、Llama2等。预训练模型通常是在大型文本语料库上进行过训练,以学习语言的通用表示。 -2. 
**下载模型权重:** - 针对选择的预训练模型,可以从HuggingFace模型库中下载预训练的权重。 -3. **模型权重转换:** - 结合自己所要使用的框架,对已经下载的HuggingFace权重进行权重转换,比如转换为MindSpore框架所支持的ckpt权重。 -4. **数据集准备:** - 结合微调的目标,选择用于微调任务的数据集,针对大语言模型,微调数据集一般是包含文本和标签的数据,比如alpaca数据集。同时在使用数据集时,需要对数据做相应的预处理,比如使用MindSpore框架时,需要将数据集转换为MindRecord格式。 -5. **执行微调任务:** - 使用微调任务的数据集对预训练模型进行训练,更新模型参数,如果是全参微调则会对所有参数进行更新,微调任务完成后,便可以得到新的模型。 - -## SFT微调方式 - -MindSpore Transformers当前支持全参微调和LoRA低参微调两种SFT微调方式。全参微调是指在训练过程中对所有参数进行更新,适用于大规模数据精调,能获得最优的任务适应能力,但需要的计算资源较大。LoRA低参微调在训练过程中仅更新部分参数,相比全参微调显存占用更少、训练速度更快,但在某些任务中的效果不如全参微调。 - -### LoRA 原理简介 - -LoRA通过将原始模型的权重矩阵分解为两个低秩矩阵来实现参数量的显著减少。例如,假设一个权重矩阵W的大小为m x n,通过LoRA,该矩阵被分解为两个低秩矩阵A和B,其中A的大小为m x r,B的大小为r x n(r远小于m和n)。在微调过程中,仅对这两个低秩矩阵进行更新,而不改变原始模型的其他部分。 - -这种方法不仅大幅度降低了微调的计算开销,还保留了模型的原始性能,特别适用于数据量有限、计算资源受限的环境中进行模型优化,详细原理可以查看论文 [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) 。 - -## 使用MindSpore Transformers进行全参微调 - -### 选择预训练模型 - -MindSpore Transformers目前已经支持业界主流大模型,该实践流程选择Llama2-7B模型SFT微调为例。 - -### 下载模型权重 - -MindSpore Transformers提供已经转换完成的预训练权重、词表文件用于预训练、微调和推理,用户也可以下载HuggingFace官方权重经过模型权重转换后进行使用。 - -词表下载链接:[tokenizer.model](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/tokenizer.model) - -| 模型名称 | MindSpore权重 | HuggingFace权重 | -|:----------|:------------------------------------------------------------------------------------------------------------:| :---------------------------------------------------------------------------------------------: | -| Llama2-7B | [Link](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/llama2/llama2_7b.ckpt) | [Link](https://huggingface.co/meta-llama/Llama-2-7b-hf) | - -> Llama2的所有权重都需要通过向Meta[提交申请](https://ai.meta.com/resources/models-and-libraries/llama-downloads)来获取,如有需要请自行申请。 - -### 模型权重转换 - -以[Llama2-7B模型](https://huggingface.co/meta-llama/Llama-2-7b-hf/tree/main)为例,原始的HuggingFace权重文件主要包含:
    - -- `config.json`:模型架构的主要配置信息
    -- `generation_config.json`:文本生成相关的配置信息
    -- `safetensors文件`:模型权重文件
    -- `model.safetensors.index.json`:safetensors模型参数文件索引和描述模型切片的json文件
    -- `bin文件`:pytorch的模型权重文件
    -- `pytorch_model.bin.index.json`:pytorch索引和描述模型切片的json文件
    -- `tokenizer.json`:分词器的词汇配置文件
    -- `tokenizer.model`:模型的分词器
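
A quick way to confirm that a downloaded checkpoint matches the layout listed above is to inspect the directory before running the conversion script described next. The snippet below is only an illustrative sketch (the helper function and path are hypothetical, not part of MindSpore Transformers); it checks for the config/tokenizer files named above and for at least one `safetensors` or `bin` weight shard.

```python
import os

# Hypothetical pre-check for a downloaded HuggingFace Llama2 directory,
# based on the file layout described above. Adjust the path before use.
REQUIRED_FILES = ["config.json", "generation_config.json", "tokenizer.json", "tokenizer.model"]
WEIGHT_SUFFIXES = (".safetensors", ".bin")

def check_hf_checkpoint(hf_dir):
    files = os.listdir(hf_dir)
    missing = [name for name in REQUIRED_FILES if name not in files]
    has_weights = any(name.endswith(WEIGHT_SUFFIXES) for name in files)
    if missing or not has_weights:
        raise FileNotFoundError(
            f"incomplete checkpoint: missing {missing}, weight shards found: {has_weights}"
        )
    print("HuggingFace checkpoint directory looks complete.")

check_hf_checkpoint("/{path}/Llama-2-7b-hf")
```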
    - -MindSpore Transformers提供权重转换脚本,通过执行[convert_weight.py转换脚本](https://gitee.com/mindspore/mindformers/blob/r1.5.0/convert_weight.py),可以将HuggingFace的权重转换为完整的ckpt权重。 - -```bash -python convert_weight.py --model llama --input_path TORCH_CKPT_DIR --output_path {path}/MS_CKPT_NAME -``` - -参数说明: - -```commandline -model: 模型名称(其他模型请参考模型说明文档) -input_path: 下载HuggingFace权重的文件夹路径 -output_path: 转换后的MindSpore权重文件保存路径 -``` - -### 数据集准备 - -MindSpore Transformers提供**WikiText2**作为预训练数据集,**alpaca**作为微调数据集。 - -| 数据集名称 | 适用模型 | 适用阶段 | 下载链接 | -|:----------|:-------------------------------------:|:---------:| :--------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| alpaca | Llama2-7B
    Llama2-13B
    Llama2-70B | 微调 | [Link](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json) | - -以alpaca数据集为例,下载数据集后需要对数据集进行预处理。预处理中所用的`tokenizer.model`可以参考模型权重下载进行下载。 - -**alpaca 数据预处理** - -1. 执行MindSpore Transformers中的[alpaca_converter.py脚本](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/llama/alpaca_converter.py),将数据集转换为多轮对话格式。 - - ```bash - python alpaca_converter.py \ - --data_path /{path}/alpaca_data.json \ - --output_path /{path}/alpaca-data-conversation.json - ``` - - 参数说明: - - ```commandline - data_path: 输入下载的文件路径 - output_path: 输出文件的保存路径 - ``` - -2. 执行MindSpore Transformers中的[llama_preprocess.py脚本](https://gitee.com/mindspore/mindformers/blob/r1.5.0/mindformers/tools/dataset_preprocess/llama/llama_preprocess.py),将数据转换为MindRecord格式。该操作依赖fastchat工具包解析prompt模板, 请提前安装fastchat >= 0.2.13。 - - ```bash - python llama_preprocess.py \ - --dataset_type qa \ - --input_glob /{path}/alpaca-data-conversation.json \ - --model_file /{path}/tokenizer.model \ - --seq_length 4096 \ - --output_file /{path}/alpaca-fastchat4096.mindrecord - ``` - - 参数说明: - - ```commandline - dataset_type: 预处理数据类型 - input_glob: 转换后的alpaca的文件路径 - model_file: 模型tokenizer.model文件路径 - seq_length: 输出数据的序列长度 - output_file: 输出文件的保存路径 - ``` - -### 执行微调任务 - -#### 单卡训练 - -执行`run_mindformer.py`启动单卡的微调任务,下面提供了一个使用示例: - -以Llama2模型单卡微调为例,由于单卡显存有限,无法运行完整的Llama2-7B模型,所以缩层进行示例,修改`finetune_llama2_7b.yaml`,将其中`num_layers`设置为2。 - -启动命令如下: - -```shell -python run_mindformer.py \ - --config configs/llama2/finetune_llama2_7b.yaml \ - --train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord \ - --load_checkpoint /{path}/llama2_7b.ckpt \ - --use_parallel False \ - --run_mode finetune -``` - -#### 单机训练 - -以Llama2-7B为例,执行msrun启动脚本,进行8卡分布式训练,启动命令如下: - -```bash -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/finetune_llama2_7b.yaml \ - --load_checkpoint /{path}/llama2_7b.ckpt \ - --train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord \ - --use_parallel True \ - --run_mode finetune" 8 -``` - -参数说明: - -```commandline -config: 模型的配置文件,文件在MindSpore Transformers代码仓中config目录下 -load_checkpoint: checkpoint文件的路径 -train_dataset_dir: 训练数据集路径 -use_parallel: 是否开启并行 -run_mode: 运行模式,train:训练,finetune:微调,predict:推理 -``` - -任务执行完成后,在mindformers/output目录下,会生成checkpoint文件夹,同时模型文件会保存在该文件夹下。 - -#### 多机训练 - -多机多卡微调任务与启动预训练类似,可参考[多机多卡的预训练命令](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/usage/pre_training.html#%E5%A4%9A%E6%9C%BA%E8%AE%AD%E7%BB%83),并对命令进行如下修改: - -1. 增加启动脚本入参`--load_checkpoint /{path}/llama2_7b.ckpt`加载预训练权重。 -2. 设置启动脚本中的`--train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord`加载微调数据集。 -3. 
设置启动脚本中的`--run_mode finetune`,run_mode表示运行模式,train:训练,finetune:微调,predict:推理。 - -任务执行完成后,在mindformers/output目录下,会生成checkpoint文件夹,同时模型文件会保存在该文件夹下。 - -## 使用MindSpore Transformers进行LoRA低参微调 - -MindSpore Transformers支持配置化使能LoRA微调,无需对每个模型进行代码适配,而仅需修改全参微调的YAML配置文件中的模型配置,添加 `pet_config` 低参微调配置,即可使用其进行LoRA低参微调任务。以下展示了Llama2模型LoRA微调的YAML配置文件中的模型配置部分,并对 `pet_config` 参数进行了详细说明。 - -### 示例配置文件(YAML) - -完整的YAML配置文件可以通过以下链接访问:[Llama2 LoRA微调 YAML 文件](https://gitee.com/mindspore/mindformers/blob/r1.5.0/configs/llama2/lora_llama2_7b.yaml)。 - -```yaml -# model config -model: - model_config: - type: LlamaConfig - batch_size: 1 - seq_length: 4096 - hidden_size: 4096 - num_layers: 32 - num_heads: 32 - vocab_size: 32000 - compute_dtype: "float16" - pet_config: - pet_type: lora - lora_rank: 16 - lora_alpha: 16 - lora_dropout: 0.05 - target_modules: '.*wq|.*wk|.*wv|.*wo' - arch: - type: LlamaForCausalLM -``` - -### pet_config 参数详解 - -在 model_config 中,pet_config 是LoRA微调的核心配置部分,用于指定LoRA的相关参数。具体参数说明如下: - -- **pet_type:** 指定参数高效微调技术(PET,Parameter-Efficient Tuning)的类型为LoRA。这意味着在模型的关键层中会插入LoRA模块,以减少微调时所需的参数量。 -- **lora_rank:** 定义了低秩矩阵的秩值。秩值越小,微调时需要更新的参数越少,从而减少计算资源的占用。这里设为16是一个常见的平衡点,在保持模型性能的同时,显著减少了参数量。 -- **lora_alpha:** 控制LoRA模块中权重更新的缩放比例。这个值决定了微调过程中,权重更新的幅度和影响程度。设为16表示缩放幅度适中,有助于稳定训练过程。 -- **lora_dropout:** 设置LoRA模块中的dropout概率。Dropout是一种正则化技术,用于减少过拟合风险。设置为0.05表示在训练过程中有5%的概率会随机“关闭”某些神经元连接,这在数据量有限的情况下尤为重要。 -- **target_modules:** 通过正则表达式指定LoRA将应用于模型中的哪些权重矩阵。在Llama中,这里的配置将LoRA应用于模型的自注意力机制中的Query(wq)、Key(wk)、Value(wv)和Output(wo)矩阵。这些矩阵在Transformer结构中扮演关键角色,插入LoRA后可以在减少参数量的同时保持模型性能。 - -### Llama2-7B 的 LoRA 微调示例 - -MindSpore Transformers 提供了 Llama2-7B 的 [LoRA 微调示例](https://gitee.com/mindspore/mindformers/blob/r1.5.0/docs/model_cards/llama2.md#lora%E5%BE%AE%E8%B0%83)。微调过程中使用的数据集可以参考[数据集下载](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json)获得。 - -以 Llama2-7B 为例,可以执行以下 msrun 启动脚本,进行 8 卡分布式微调。 - -```shell -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/lora_llama2_7b.yaml \ - --train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord \ - --load_checkpoint /{path}/llama2_7b.ckpt \ - --auto_trans_ckpt False \ - --use_parallel True \ - --run_mode finetune" 8 -``` - -当权重的分布式策略和模型的分布式策略不一致时,需要对权重进行切分转换。加载权重路径应设置为以 `rank_0` 命名的目录的上一层路径,同时开启权重自动切分转换功能 `--auto_trans_ckpt True` 。关于分布式权重切分转换的场景和使用方式的更多说明请参考[分布式权重切分与合并](https://www.mindspore.cn/mindformers/docs/zh-CN/r1.5.0/function/transform_weight.html)。 - -```shell -bash scripts/msrun_launcher.sh "run_mindformer.py \ - --config configs/llama2/lora_llama2_7b.yaml \ - --train_dataset_dir /{path}/alpaca-fastchat4096.mindrecord \ - --load_checkpoint /{path}/checkpoint/ \ - --auto_trans_ckpt True \ - --use_parallel True \ - --run_mode finetune" 8 -```
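
To make the effect of the `pet_config` above concrete, the back-of-the-envelope sketch below estimates how many trainable parameters the LoRA adapters add for the configuration shown earlier (lora_rank 16, target modules wq/wk/wv/wo, hidden_size 4096, num_layers 32). It assumes every targeted projection is a square hidden_size x hidden_size matrix, which holds for Llama2-7B; it is an illustration of the low-rank decomposition described above, not output from MindSpore Transformers.

```python
# LoRA parameter-count estimate for the pet_config shown above.
# Assumption: each target projection (wq, wk, wv, wo) is hidden_size x hidden_size,
# as in Llama2-7B (hidden_size=4096, num_layers=32).
hidden_size = 4096
num_layers = 32
lora_rank = 16
matrices_per_layer = 4  # wq, wk, wv, wo

full_per_matrix = hidden_size * hidden_size    # original weight W (m x n)
lora_per_matrix = 2 * hidden_size * lora_rank  # low-rank factors A (m x r) and B (r x n)

full_total = full_per_matrix * matrices_per_layer * num_layers
lora_total = lora_per_matrix * matrices_per_layer * num_layers

print(f"attention projections, full fine-tuning: {full_total / 1e6:.1f}M parameters")
print(f"LoRA adapters (rank {lora_rank}): {lora_total / 1e6:.2f}M parameters")
print(f"trainable fraction: {lora_total / full_total:.2%}")
```

At rank 16 the adapters amount to well under 1% of the parameters held by the targeted projections, which is why LoRA fine-tuning updates far fewer weights and needs noticeably less device memory than full-parameter fine-tuning.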