From 22c573334d34a9e85ed361124c084608eab7c3da Mon Sep 17 00:00:00 2001 From: "Wu,Qiang-Roy" Date: Wed, 28 May 2025 16:44:28 +0800 Subject: [PATCH 1/7] add feature: install npu driver in docker container --- .../software/Ko-Files_24.1.RC3.json | 31 ++ .../software/Ko-Files_25.0.RC1.json | 31 ++ .../library/install_npu_by_container.py | 299 ++++++++++++++++++ .../install/install_npu_by_container.yml | 25 ++ ascend_deployer/scripts/pkg_utils.py | 5 + .../tools/npu_container/910A2/Dockerfile | 12 + .../tools/npu_container/910A2/davinci.conf | 1 + .../npu_container/910A2/dms_events_conf.lst | 217 +++++++++++++ .../tools/npu_container/910A2/entrypoint.sh | 4 + .../tools/npu_container/910A2/installation.py | 177 +++++++++++ .../tools/npu_container/Dockerfile | 5 + ascend_deployer/tools/npu_container/README.md | 9 + 12 files changed, 816 insertions(+) create mode 100644 ascend_deployer/downloader/software/Ko-Files_24.1.RC3.json create mode 100644 ascend_deployer/downloader/software/Ko-Files_25.0.RC1.json create mode 100644 ascend_deployer/library/install_npu_by_container.py create mode 100644 ascend_deployer/playbooks/install/install_npu_by_container.yml create mode 100644 ascend_deployer/tools/npu_container/910A2/Dockerfile create mode 100644 ascend_deployer/tools/npu_container/910A2/davinci.conf create mode 100644 ascend_deployer/tools/npu_container/910A2/dms_events_conf.lst create mode 100644 ascend_deployer/tools/npu_container/910A2/entrypoint.sh create mode 100644 ascend_deployer/tools/npu_container/910A2/installation.py create mode 100644 ascend_deployer/tools/npu_container/Dockerfile create mode 100644 ascend_deployer/tools/npu_container/README.md diff --git a/ascend_deployer/downloader/software/Ko-Files_24.1.RC3.json b/ascend_deployer/downloader/software/Ko-Files_24.1.RC3.json new file mode 100644 index 00000000..2a809feb --- /dev/null +++ b/ascend_deployer/downloader/software/Ko-Files_24.1.RC3.json @@ -0,0 +1,31 @@ +{ + "name": "Ko-Files", + "version": "24.1.RC1", + "default": false, + "other": [ + { + "filename": "openeuler-with-kmod-22.03-lts-aarch64.tar.gz", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/os/openeuler-with-kmod-22.03-lts-aarch64.tar.gz", + "sha256": "935e00ad4c0de79a31c0e0212e94fc6e7a68fb4896b125ec63c027e90689b01b", + "dest": "resources/npu_container" + }, + { + "filename": "openeuler-with-kmod-22.03-lts-x86_64.tar.gz", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/os/openeuler-with-kmod-22.03-lts-x86_64.tar.gz", + "sha256": "0055a5e721460bbe8757cdf4ee0fa256eb43daf24e467c52a4e2aa4fe56206b0", + "dest": "resources/npu_container" + }, + { + "filename": "OpenEuler_22.03LTS_aarch64_ko_files_24.1.RC1.zip", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.0/A300-3000-npu_24.1.0_linux-aarch64.zip", + "sha256": "4dc7f0181ea069539d6a52d2f613e4478dcab084f1300f07ec54d8f79715f2ab", + "dest": "resources/npu_container" + }, + { + "filename": "OpenEuler_22.03LTS_x86_64_ko_files_24.1.RC1.zip", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.0/A300-3000-npu_24.1.0_linux-aarch64.zip", + "sha256": "4dc7f0181ea069539d6a52d2f613e4478dcab084f1300f07ec54d8f79715f2ab", + "dest": "resources/npu_container" + } + ] + } \ No newline at end of file diff --git a/ascend_deployer/downloader/software/Ko-Files_25.0.RC1.json b/ascend_deployer/downloader/software/Ko-Files_25.0.RC1.json new file mode 100644 index 00000000..368ac070 --- /dev/null +++ b/ascend_deployer/downloader/software/Ko-Files_25.0.RC1.json @@ -0,0 +1,31 @@ +{ + "name": "Ko-Files", + "version": "25.0.RC1", + "default": true, + "other": [ + { + "filename": "openeuler-with-kmod-22.03-lts-aarch64.tar.gz", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/os/openeuler-with-kmod-22.03-lts-aarch64.tar.gz", + "sha256": "935e00ad4c0de79a31c0e0212e94fc6e7a68fb4896b125ec63c027e90689b01b", + "dest": "resources/npu_container" + }, + { + "filename": "openeuler-with-kmod-22.03-lts-x86_64.tar.gz", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/os/openeuler-with-kmod-22.03-lts-x86_64.tar.gz", + "sha256": "0055a5e721460bbe8757cdf4ee0fa256eb43daf24e467c52a4e2aa4fe56206b0", + "dest": "resources/npu_container" + }, + { + "filename": "OpenEuler_22.03LTS_aarch64_ko_files_25.0.RC1.zip", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.0/A300-3000-npu_24.1.0_linux-aarch64.zip", + "sha256": "4dc7f0181ea069539d6a52d2f613e4478dcab084f1300f07ec54d8f79715f2ab", + "dest": "resources/npu_container" + }, + { + "filename": "OpenEuler_22.03LTS_x86_64_ko_files_25.0.RC1.zip", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.0/A300-3000-npu_24.1.0_linux-aarch64.zip", + "sha256": "4dc7f0181ea069539d6a52d2f613e4478dcab084f1300f07ec54d8f79715f2ab", + "dest": "resources/npu_container" + } + ] + } \ No newline at end of file diff --git a/ascend_deployer/library/install_npu_by_container.py b/ascend_deployer/library/install_npu_by_container.py new file mode 100644 index 00000000..5c0ac17a --- /dev/null +++ b/ascend_deployer/library/install_npu_by_container.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +# coding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# =========================================================================== + +from datetime import datetime +import glob +import os +from pathlib import Path +import platform +import time + +from utils import ROOT_PATH +from ansible.module_utils.basic import AnsibleModule +from ansible.module_utils.check_utils import CheckUtil +from ansible.module_utils import common_info +from ansible.module_utils.common_info import NPUCardName, get_os_and_arch, ARCH + + +class Base: + + step_install = "install" + step_build = "build" + + def __init__(self): + self.module = AnsibleModule( + argument_spec=dict( + resource_dir=dict(type="str", required=True), + step=dict(type="str", required=True), + image_name=dict(type="str", required=False), + ) + ) + self.facts = dict() + self.step = self.module.params["step"] + self.image_name = self.module.params.get("image_name") + self.npu_name = self._get_npu_name() + self.npu_container = os.path.join(ROOT_PATH, "resources/npu_container") + self.docker_dir = os.path.join(ROOT_PATH, "tools/npu_container/{}".format(self.npu_name)) + self.needed_commands = ["docker", "unzip", "tar"] + self._validator() + + @staticmethod + def _get_npu_name(): + npu = CheckUtil.get_card() + if npu.endswith("b"): + return NPUCardName.A910A2 + if npu.endswith("93"): + return NPUCardName.A910A3 + if npu.endswith("910"): + return NPUCardName.A910A1 + return npu + + def _validator(self): + if not Path(self.docker_dir).exists(): + self.module.fail_json( + changed=False, + rc=1, + msg="[ASCEND][[ERROR]]check the docker dir failed: {} is not existed.".format(self.docker_dir) + ) + for cmd in self.needed_commands: + if not self.module.get_bin_path(cmd): + self.module.fail_json( + changed=False, + rc=1, + msg="[ASCEND][[ERROR]]check the required command failed: {} is not existed.".format(self.cmd) + ) + + def run(self, cmd): + return_code, out, err = self.module.run_command(cmd) + output = out + err + if return_code != 0: + self.module.fail_json( + changed=False, + rc=1, + msg="[ASCEND][[ERROR]]run command: {} failed, output: {}".format(cmd, output) + ) + return output + + +class ProcessNPUDriver(Base): + """ + A class to handle the NPU driver directory. + """ + + def __init__(self): + super(ProcessNPUDriver, self).__init__() + self.npu_info = common_info.get_npu_info() + self.resource_dir = os.path.expanduser(self.module.params["resource_dir"]) + self.driver_parent_dir = os.path.join(ROOT_PATH, "/resources/npu") + self.driver_file_path = None + self.messages = [] + + def _find_files(self, path, pattern): + self.messages.append("try to find {} for {}".format(path, pattern)) + matched_files = glob.glob(os.path.join(path, pattern)) + self.messages.append("find files: " + ",".join(matched_files)) + if len(matched_files) > 0: + return matched_files[0] + return "" + + def _find_npu_files(self): + arch = ARCH + if arch == "x86_64": + arch = "x86?64" # old package mix x86-64 and x86_64 + uniform_npu_scene = npu_scene = self.npu_info.get("scene") + # uniform package has higher priority + uni_package_path = common_info.get_scene_dict(os.path.expanduser(self.resource_dir)).get(uniform_npu_scene) + if uni_package_path: + driver_file_path = self._find_files(uni_package_path, r"*npu-driver*linux*%s*.run" % arch) + self.driver_file_path = driver_file_path or self.driver_file_path + return + + def extract_npu_driver(self): + self._find_npu_files() + if not self.driver_file_path: + self.module.fail_json( + changed=False, + rc=1, + msg="[ASCEND][[ERROR]]Do not find npu driver run file in {}.".format(self.resource_dir) + ) + driver_dir = os.path.join(self.driver_parent_dir, "npu_driver") + command = "bash {} --noexec --extract={}".format( + self.driver_file_path, driver_dir) + self.run(command) + return driver_dir + + +class ImageController(Base): + + openeuler_with_kmod = "openeuler-with-kmod:22.03-lts" + + def __init__(self): + super(ImageController, self).__init__() + self.version = datetime.now().strftime("%Y%m%d%H%M") + + def _install_base_image(self, base_image_name): + command = "docker load -i {}/{}".format(self.npu_container, base_image_name) + self.run(command) + + def build_image(self): + image_name = "npu_driver_image:v{}".format(self.version) + self._install_base_image(self.openeuler_with_kmod) + command = "docker build -t {} .".format(image_name) + self.run(command) + command = "docker save {} -o {}/{}".format(image_name, self.npu_container, image_name) + self.run(command) + return image_name + + def load_image(self): + if not self.image_name: + self.module.fail_json( + changed=False, + rc=1, + msg="[ASCEND][[ERROR]]Input image name is empty." + ) + image_file = os.path.join(self.npu_container, self.image_name) + command = "docker load -i {}".format(image_file) + self.run(command) + + def run_container(self, image_name): + container_name = "npu_driver_container_{}".format(self.version) + command = "docker run " \ + "--privileged " \ + "--it -d " \ + "-v /lib:/lib " \ + "-v /usr/local:/usr/local " \ + "-v /etc:/etc " \ + "--name {} {}".format(container_name, image_name) + self.run(command) + return container_name + + def destroy_container(self, container_name): + command = "docker rm -f {}".format(container_name) + self.run(command) + + +class InstallNPUDriverByContainer(Base): + """ + A class to handle the installation of NPU drivers using Docker containers. + """ + + def __init__(self): + super(InstallNPUDriverByContainer, self).__init__() + self.os_and_arch = get_os_and_arch() + self.image_controller = ImageController() + self.npu_driver = ProcessNPUDriver() + self.kernel_version = platform.uname().release + self.env_sh = Path("/etc/profile.d/ascend.sh") + self._validator() + + # def _unzip_ko_files(self): + # ko_files_zip = os.path.join(self.ko_files_dir, "{}_ko_files.zip".format(self.os_and_arch)) + # command = "unzip -q {}".format(ko_files_zip) + # self.run(command) + + def _find_ko_files(self): + """ + - /resources/ko_files/{}_ko_files + - 4.19.90 + - 5.10.0 + """ + folder = "{}_ko_files".format(self.os_and_arch) + unzipped_ko_files_dir = os.path.join(self.npu_container, folder) + if not Path(unzipped_ko_files_dir).exists(): + self.module.fail_json( + changed=False, + rc=1, + msg="[ASCEND][[ERROR]]folder({}} does not exist.".format(folder) + ) + # process kernel + # kernel example: 4.19.90-vhulk2111.1.0.h963.eulerosv2r10.aarch64, 5.10.0-60.18.0.50.oe2203.aarch64 + # process it as: 4.19.90, 5.10.0 + # split string with "-", get the first element + simple_kernel = self.kernel_version.split("-")[0] + all_existed_version = [ + i for i in os.listdir(unzipped_ko_files_dir) if os.path.isdir(os.path.join(unzipped_ko_files_dir, i)) + ] + if not all_existed_version: + self.module.fail_json( + changed=False, + rc=1, + msg="[ASCEND][[ERROR]]no ko files is found in {}.".format(unzipped_ko_files_dir) + ) + + origin_ko_files_dir = os.path.join(unzipped_ko_files_dir, all_existed_version[0]) + if simple_kernel in all_existed_version: + origin_ko_files_dir = os.path.join(unzipped_ko_files_dir, simple_kernel) + + return origin_ko_files_dir + + def _copy_resources(self): + driver_dir = self.npu_driver.extract_npu_driver() + # self._unzip_ko_files() + origin_ko_files_dir = self._find_ko_files() + commands = [ + "cp -r {}/driver {} ".format(driver_dir, self.docker_dir), + "mkdir -p {}/ko_files && cp {}/* {}/ko_files".format(self.docker_dir, origin_ko_files_dir, self.docker_dir) + ] + for cmd in commands: + self.run(cmd) + + def _query_result(self, container_name): + command = "bash {}".format(str(self.env_sh)) + max_retry = 10 + for i in range(max_retry): + if not Path(self.env_sh).exists(): + time.sleep((i+1) ** 2) + continue + self.run(command) + if not self.module.get_bin_path("npu-smi"): + self.module.fail_json( + changed=False, + rc=1, + msg="[ASCEND][[ERROR]]npu-smi command not found, " \ + "please check the log of the container by 'docker logs {}'".format(container_name) + ) + + def execute(self): + """ + Main method to install the NPU driver by running a Docker container. + """ + result = {} + if self.step == self.step_build: + self._copy_resources + image_name = self.image_controller.build_image() + self.facts["image_name"] = image_name + self.module.exist_json( + changed=True, + msg="build {} successfully.".format(image_name), + ansible_facts=self.facts, + ) + elif self.step == self.step_install: + container_name = self.image_controller.run_container() + self._query_result(container_name) + self.image_controller.destroy_container(container_name) + else: + self.module.fail_json( + changed=False, + rc=1, + msg="[ASCEND][[ERROR]]Invalid step: {}".format(self.step) + ) + + +if __name__ == "__main__": + installer = InstallNPUDriverByContainer() + installer.execute() diff --git a/ascend_deployer/playbooks/install/install_npu_by_container.yml b/ascend_deployer/playbooks/install/install_npu_by_container.yml new file mode 100644 index 00000000..e32c9837 --- /dev/null +++ b/ascend_deployer/playbooks/install/install_npu_by_container.yml @@ -0,0 +1,25 @@ +- hosts: + - master[0] + - other_build_image + name: build image for npu driver + tasks: + - name: build npu driver image + install_npu_by_container: + resources_dir: "{{ resource_path }}" + step: 'build' + + - name: fetch noded image + scp: + ip: "{{ inventory_hostname }}" + port: "{{ansible_ssh_port|default('22')}}" + remote_user: "{{ ansible_ssh_user }}" + passwd: "{{ ansible_ssh_pass|default('') }}" + src: "{{ resource_path }}/npu_container/{{ image_name }}" + dest: "{{ resource_path }}/npu_container/" + fetch: 'true' + delegate_to: localhost + + - name: install npu driver by container + install_npu_by_container: + step: 'install' + image_name: "{{ image_name }}" \ No newline at end of file diff --git a/ascend_deployer/scripts/pkg_utils.py b/ascend_deployer/scripts/pkg_utils.py index d93c1ffe..e527a4e8 100644 --- a/ascend_deployer/scripts/pkg_utils.py +++ b/ascend_deployer/scripts/pkg_utils.py @@ -215,6 +215,11 @@ tags_map = { 'name_keywords': ['mindie-image'], 'path_keywords': ['MindIE-image', ], }, + 'npu_by_container': { + 'need_nexus': False, + 'name_keywords': ['ko_files'], + 'path_keywords': ['npu_container'], + }, } pkg_run_paths = ( diff --git a/ascend_deployer/tools/npu_container/910A2/Dockerfile b/ascend_deployer/tools/npu_container/910A2/Dockerfile new file mode 100644 index 00000000..a2673a3a --- /dev/null +++ b/ascend_deployer/tools/npu_container/910A2/Dockerfile @@ -0,0 +1,12 @@ +FROM openeuler-with-kmod:22.03-lts + +WORKDIR /app + +COPY davinci.conf . +COPY dms_events_conf.lst . +COPY ./driver /app/driver +COPY ./ko_files /app/ko_files +COPY installation.py . +COPY entrypoint.sh . + +CMD ["/app/entrypoint.sh"] \ No newline at end of file diff --git a/ascend_deployer/tools/npu_container/910A2/davinci.conf b/ascend_deployer/tools/npu_container/910A2/davinci.conf new file mode 100644 index 00000000..41c033a3 --- /dev/null +++ b/ascend_deployer/tools/npu_container/910A2/davinci.conf @@ -0,0 +1 @@ +DAVINCI_HOME_PATH=/usr/local/Ascend \ No newline at end of file diff --git a/ascend_deployer/tools/npu_container/910A2/dms_events_conf.lst b/ascend_deployer/tools/npu_container/910A2/dms_events_conf.lst new file mode 100644 index 00000000..899d83bf --- /dev/null +++ b/ascend_deployer/tools/npu_container/910A2/dms_events_conf.lst @@ -0,0 +1,217 @@ +# event_code serverity + +# H2D heartbeat +0x40F84E00 3 + +# memory +0x80E01805 0 +0x80E18400 1 +0x80E01801 3 +0x80E18401 0 +0x80E18402 3 +0x80F38006 0 +0x80F38008 2 +0x80F38003 0 +0x80F2180D 2 +0x80E18006 0 +0x80E18005 1 +0x80E18008 2 +0x80E1800A 2 +0x80E18009 2 +0x80E01809 0 +0x80E00209 1 +0x80E0020B 3 +0x80E18000 1 +0x80E1800F 0 + +# SOC & L3D L3T CPU core +0x80A18006 0 +0x80A18005 1 +0x80A18008 2 +0x80A38006 0 +0x80A38008 2 +0x80A38003 0 +0x80A58006 0 +0x80A58008 2 +0x80A58003 0 +0x8C2FA009 0 +0x80CD8809 3 +0x80818C00 1 + +# PM/IAM +0x8C084E00 1 +0x8C0E4E00 1 +0x8C0A4E00 1 +0x8C104E00 1 +0x8C0C4E00 1 +0x8C204E00 2 +0x8C124E00 1 +0x8C044E00 1 +0x8C064E00 1 +0x8C03A000 2 +0x8C2FA001 2 +0x8C464E00 1 + +# RTS +0x80C98008 1 +0x80C98002 1 +0x80C98003 1 +0x80C98009 1 +0x80C98007 0 +0x80F78006 0 +0x80C98006 0 +0x80C78008 2 +0x80F78009 0 +0x80F78003 2 +0x80F78008 2 +0x80FA4E00 2 +0x80CD8006 0 +0x80CD8008 2 +0x80CD8003 1 +0x80FB8000 0 +0x812E4E00 2 +0x80CF8000 2 +0x80F78C02 2 +0x80F78C03 2 +0x80F78C04 2 +0x80CB800A 1 +0x80C9800A 1 +0x80818C05 2 +0x80818C06 0 +0x80C98001 2 +0x80CB8001 2 + +#DSA +0x81318006 0 +0x81318008 1 + +#TS Subsys disp +0x81338006 0 +0x81338008 2 +0x81338002 2 +0x81338004 0 + +#DSA Subsys disp +0x81938006 0 +0x81938004 0 +0x81938002 2 +0x81938008 2 + +#AIC Subsys disp +0x813B8006 0 +0x813B8008 2 +0x813B8002 2 +0x813B8004 0 + +# LPM +0x80E24E00 2 +0x80E20207 2 +0x80E3A201 2 +0x80E3A203 2 +0x80E39200 2 +0x80E21007 1 +0x80E38008 2 +0x80E38003 1 +0x80E21E01 2 +0x80E21008 2 + +# PCIe +0x80B98000 2 +0x80B98006 0 +0x80B98008 2 + +# PCIe DISP +0x81978002 2 +0x81978004 0 +0x81978006 0 +0x81978008 2 + +# Network +0x80BD8008 2 +0x80BD8000 2 +0x80BD8003 2 +0x80BD8009 2 +0x80BB8008 2 +0x80BB8009 2 +0x80BB8000 2 +0x80BB8003 2 +0x80BB800A 2 +0x81AB8003 2 +0x81AB8008 2 +0x81AB800C 2 +0x81078605 2 +0x81AD8605 2 +0x81078603 1 +0x81078607 1 +0x8C1F8608 1 +0x4C1F8608 1 + +#NIC Subsys disp +0x81958006 0 +0x81958004 0 +0x81958008 2 +0x81958002 2 + +#aicpu +0x8C1FA006 2 +0x8C17A005 1 +0x8C1DA005 1 +0x8C19A005 1 + +#TEEDrv +0x80E78000 2 +0x80E78008 2 + +#HSM +0x80E58E03 2 +0x80E58E02 2 + +#HCCS +0x819B8003 0 +0x819B8006 0 +0x819B8605 1 +0x819B800D 1 +0x819B800A 2 + +#DVPP AA/SMMU +0x814D8006 0 +0x814D8008 0 +0x814D8004 3 +0x81498004 2 + +#DVPP +0x80D38009 1 +0x80D58000 1 +0x80D58009 1 +0x80D98008 1 +0x80DD8000 1 +0x80DD8003 1 +0x80DD8008 1 +0x80DD8007 0 + +#DVPP Subsys disp +0x81478002 2 +0x81478004 0 +0x81478006 0 +0x81478008 2 + +#PERI subsys disp +0x815F8002 2 +0x815F8004 0 +0x815F8006 0 +0x815F8008 2 + +#SLLC +0x80B78000 2 +0x80B78005 1 +0x80B78006 0 + +#SIOE +0x80B58000 2 + +#HCCS LINK +0x81A3880C 0 + +#TLS +0x80818200 1 +0x80818201 2 \ No newline at end of file diff --git a/ascend_deployer/tools/npu_container/910A2/entrypoint.sh b/ascend_deployer/tools/npu_container/910A2/entrypoint.sh new file mode 100644 index 00000000..b13048ec --- /dev/null +++ b/ascend_deployer/tools/npu_container/910A2/entrypoint.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +python3 /app/installation.py +tail -f /dev/null \ No newline at end of file diff --git a/ascend_deployer/tools/npu_container/910A2/installation.py b/ascend_deployer/tools/npu_container/910A2/installation.py new file mode 100644 index 00000000..250d2551 --- /dev/null +++ b/ascend_deployer/tools/npu_container/910A2/installation.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +# coding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# =========================================================================== + +import glob +import os +from pathlib import Path +import re +import subprocess +from typing import List + + +class Commands: + """ + A class to handle shell commands execution. + """ + @staticmethod + def run(command: List[str]) -> None: + """ + Execute a shell command. + :param command: The command to execute. + :return: The output of the command. + """ + result = subprocess.run(command, shell=True, check=True) + if result.returncode != 0: + raise RuntimeError( + f"Command '{command}' failed with return code {result.returncode}. " + f"Output: {result.stdout.strip()}, Error: {result.stderr.strip() if result.stderr else ''}" + ) + + +class Installation: + """ + A class to handle the installation of NPU drivers in a container environment. + """ + + def __init__(self): + self.kernel_version = os.uname().release + self.ko_folder = f"/lib/modules/{self.kernel_version}/npu_driver" + + + def _setup(self): + """ + Setup the installation environment. + """ + commands = [ + ["mkdir", "/usr/local/Ascend"], + ["mkdir", "-p", "/user/local/Ascend/driver/lib64/common"], + ["mkdir", "-p", self.ko_folder], + ] + for command in commands: + Commands().run(command) + print("Setup completed successfully.") + + def _copy_resources(self): + """ + Copy necessary resources for installation. + """ + commands = [ + ["cp", "-r", "/app/driver /usr/local/Ascend/"], + ["cp", "-r", "/app/davinci.conf /lib/"], + ["cp", "-r", "/app/dms_events.conf", "/etc/"], + ["cp", "/usr/local/Ascend/driver/lib64/*.so", "/usr/local/Ascend/driver/lib64/common"], + ["cp", "/app/ko_files/*.ko", self.ko_folder], + ] + for command in commands: + Commands.run(command) + print("Resources copied successfully.") + + def _update_permissions(self): + """ + Update permissions for the copied resources. + """ + commands = [ + ["chmod", "777", "/lib/davinci.conf"], + ["chmod", "777", "/etc/dms_events.conf"] + ] + for command in commands: + Commands.run(command) + print("Permission updated successfully.") + + def _extract_array(self, content: str, key: str) -> List[str]: + """ + Extract an array of strings from a given content based on a key. + :param content: The content to search in. + :param key: The key to search for. + :return: A list of strings extracted from the content. + """ + pattern = rf"{key}=\((.*?)\)" + match = re.search(pattern, content, re.DOTALL) + if not match: + raise ValueError(f"Key '{key}' not found in content.") + return match.group(1).strip().split() + + def _update_specific_func(self): + target_dir = "/usr/local/Ascend/driver/device" + specific_func_file = "/app/driver/script/specific_func.inc" + + with open(specific_func_file, 'r', encoding="utf-8") as file: + content = file.read() + + src_names = self._extract_array(content, "src_names") + dst_names = self._extract_array(content, "dst_names") + + if len(src_names) != len(dst_names): + raise ValueError( + "The number of source names and destination names do not match.") + for (i, src) in enumerate(src_names): + src_file = f"{target_dir}/{src}" + dst_file = f"{target_dir}/{dst_names[i]}" + + if not glob.glob(src_file): + raise FileNotFoundError( + f"Source file '{src_file}' does not exist.") + + Commands.run(["mv", src_file, dst_file]) + print(f"Renaming {src_file} to {dst_file} successfully.") + + def _install_ko(self): + """ + Install kernel objects (ko files). + """ + for ko_file in Path(self.ko_folder).glob("*.ko"): + if ko_file.is_file(): + module_name = ko_file.stem + print(f"Installing kernel object: {ko_file.name}") + Commands.run(["modprobe", module_name]) + print(f"Installed kernel object: {ko_file.name}") + + def _configure_env(self): + """ + Configure the environment for the installation. + """ + env_path = Path("/etc/profile.d/ascend.sh") + env_content = ( + "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common:" + "/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH\n" + ) + + try: + with env_path.open("w", encoding="utf-8") as env_file: + env_file.write(env_content) + Commands.run(["bash", str(env_path)]) + print("Environment configured successfully.") + except Exception as e: + raise RuntimeError(f"Failed to configure environment: {e}") from e + + def install(self): + """ + Main installation method that orchestrates the setup, copying of resources, + updating permissions, installing kernel objects, and configuring the environment. + """ + self._setup() + self._copy_resources() + self._update_permissions() + self._update_specific_func() + self._install_ko() + self._configure_env() + + +if __name__ == "__main__": + installer = Installation() + installer.install() diff --git a/ascend_deployer/tools/npu_container/Dockerfile b/ascend_deployer/tools/npu_container/Dockerfile new file mode 100644 index 00000000..c9df8dd5 --- /dev/null +++ b/ascend_deployer/tools/npu_container/Dockerfile @@ -0,0 +1,5 @@ +FROM openeuler/openeuler:22.03-lts + +RUN yum update && yum install -y kmod + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/ascend_deployer/tools/npu_container/README.md b/ascend_deployer/tools/npu_container/README.md new file mode 100644 index 00000000..b6dbf94e --- /dev/null +++ b/ascend_deployer/tools/npu_container/README.md @@ -0,0 +1,9 @@ +## How to build openeuler-with-kmod:22.03-lts + +### Command + +`docker build -t openeuler-with-kmod::22.03 .` + +### Dependency + +Please make sure the openeuler/openeuler:22.03-lts is existed. \ No newline at end of file -- Gitee From ca80ff0c1aa81ebbee001f141545ad184539706c Mon Sep 17 00:00:00 2001 From: WuQiang-Roy Date: Wed, 4 Jun 2025 09:55:51 +0800 Subject: [PATCH 2/7] support install --- ascend_deployer/playbooks/process/process_install.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ascend_deployer/playbooks/process/process_install.yml b/ascend_deployer/playbooks/process/process_install.yml index 83470ce7..e2328051 100644 --- a/ascend_deployer/playbooks/process/process_install.yml +++ b/ascend_deployer/playbooks/process/process_install.yml @@ -18,6 +18,10 @@ import_playbook: ../install/install_npu.yml tags: driver,firmware,npu +- name: install npu driver by container + import_playbool: ../install/install_npu_by_container + tags: npu_by_container + - name: install atlasedge import_playbook: ../install/install_atlasedge.yml tags: atlasedge -- Gitee From 25dedb206adede2283a4dece16c525296580c200 Mon Sep 17 00:00:00 2001 From: xuchuan19 Date: Wed, 4 Jun 2025 10:03:32 +0800 Subject: [PATCH 3/7] =?UTF-8?q?=E9=80=82=E9=85=8Dnpu=E5=AE=B9=E5=99=A8?= =?UTF-8?q?=E5=8C=96=E5=AE=89=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ascend_deployer/downloader/other_downloader.py | 2 +- .../downloader/software/KO-Files_25.0.RC1.json | 13 +++++++++++++ ascend_deployer/downloader/software_mgr.py | 1 + ascend_deployer/scripts/pkg_utils.py | 5 +++++ 4 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 ascend_deployer/downloader/software/KO-Files_25.0.RC1.json diff --git a/ascend_deployer/downloader/other_downloader.py b/ascend_deployer/downloader/other_downloader.py index bad6725a..50160766 100644 --- a/ascend_deployer/downloader/other_downloader.py +++ b/ascend_deployer/downloader/other_downloader.py @@ -84,7 +84,7 @@ class OtherDownloader: other_pkgs = [pkg for pkg in other_pkgs if "tfplugin" not in pkg.filename] download_dir = os.path.join(self._base_dir, "resources", "{0}_{1}".format(soft_ver.name, soft_ver.version)) self._mk_download_dir(other_pkgs, download_dir, soft_ver) - if soft_ver.name in ("CANN", "NPU", "FaultDiag", "MindIE-image"): + if soft_ver.name in ("CANN", "NPU", "FaultDiag", "MindIE-image", "KO-Files"): results = self._collect_pkgs_by_arch(arch, download_dir, self._base_dir, other_pkgs) else: results = [] diff --git a/ascend_deployer/downloader/software/KO-Files_25.0.RC1.json b/ascend_deployer/downloader/software/KO-Files_25.0.RC1.json new file mode 100644 index 00000000..2c68f109 --- /dev/null +++ b/ascend_deployer/downloader/software/KO-Files_25.0.RC1.json @@ -0,0 +1,13 @@ +{ + "name": "KO-Files", + "version": "25.0.RC1", + "default": true, + "other": [ + { + "filename": "openeuler-with-kmod-22.03-lts-aarch64.tar.gz", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/os/openeuler-with-kmod-22.03-lts-aarch64.tar.gz", + "sha256": "935e00ad4c0de79a31c0e0212e94fc6e7a68fb4896b125ec63c027e90689b01b", + "dest": "resources/npu_container" + } + ] +} \ No newline at end of file diff --git a/ascend_deployer/downloader/software_mgr.py b/ascend_deployer/downloader/software_mgr.py index f0144369..f58fbc4a 100644 --- a/ascend_deployer/downloader/software_mgr.py +++ b/ascend_deployer/downloader/software_mgr.py @@ -170,6 +170,7 @@ class SoftwareMgr: :param in: version 软件版本 :return: 安装软件name所需要下载的其他内容列表 """ + # print(f"44444444444444444444444----{self.other_software_list}") for soft in self.other_software_list: if soft.name.lower() == name.lower() and (version is None or soft.version == version): return soft.other diff --git a/ascend_deployer/scripts/pkg_utils.py b/ascend_deployer/scripts/pkg_utils.py index d93c1ffe..a30eabfe 100644 --- a/ascend_deployer/scripts/pkg_utils.py +++ b/ascend_deployer/scripts/pkg_utils.py @@ -215,6 +215,11 @@ tags_map = { 'name_keywords': ['mindie-image'], 'path_keywords': ['MindIE-image', ], }, + 'npu_by_container': { + 'need_nexus': False, + 'name_keywords': [], + 'path_keywords': ['npu_container', ], + }, } pkg_run_paths = ( -- Gitee From 4c5c6c1ee32d6f18271a4c0cd53b1099f7df5272 Mon Sep 17 00:00:00 2001 From: xuchuan19 Date: Wed, 4 Jun 2025 10:05:50 +0800 Subject: [PATCH 4/7] =?UTF-8?q?=E9=80=82=E9=85=8Dnpu=E5=AE=B9=E5=99=A8?= =?UTF-8?q?=E5=8C=96=E5=AE=89=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ascend_deployer/downloader/software_mgr.py | 1 - ascend_deployer/scripts/pkg_utils.py | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/ascend_deployer/downloader/software_mgr.py b/ascend_deployer/downloader/software_mgr.py index f58fbc4a..f0144369 100644 --- a/ascend_deployer/downloader/software_mgr.py +++ b/ascend_deployer/downloader/software_mgr.py @@ -170,7 +170,6 @@ class SoftwareMgr: :param in: version 软件版本 :return: 安装软件name所需要下载的其他内容列表 """ - # print(f"44444444444444444444444----{self.other_software_list}") for soft in self.other_software_list: if soft.name.lower() == name.lower() and (version is None or soft.version == version): return soft.other diff --git a/ascend_deployer/scripts/pkg_utils.py b/ascend_deployer/scripts/pkg_utils.py index a30eabfe..b418a86b 100644 --- a/ascend_deployer/scripts/pkg_utils.py +++ b/ascend_deployer/scripts/pkg_utils.py @@ -214,12 +214,7 @@ tags_map = { 'need_nexus': False, 'name_keywords': ['mindie-image'], 'path_keywords': ['MindIE-image', ], - }, - 'npu_by_container': { - 'need_nexus': False, - 'name_keywords': [], - 'path_keywords': ['npu_container', ], - }, + } } pkg_run_paths = ( -- Gitee From c94010b0e2e267b2bbcd249f2977471a29a34fd2 Mon Sep 17 00:00:00 2001 From: xuchuan19 Date: Wed, 4 Jun 2025 10:06:28 +0800 Subject: [PATCH 5/7] =?UTF-8?q?=E9=80=82=E9=85=8Dnpu=E5=AE=B9=E5=99=A8?= =?UTF-8?q?=E5=8C=96=E5=AE=89=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ascend_deployer/scripts/pkg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ascend_deployer/scripts/pkg_utils.py b/ascend_deployer/scripts/pkg_utils.py index b418a86b..d93c1ffe 100644 --- a/ascend_deployer/scripts/pkg_utils.py +++ b/ascend_deployer/scripts/pkg_utils.py @@ -214,7 +214,7 @@ tags_map = { 'need_nexus': False, 'name_keywords': ['mindie-image'], 'path_keywords': ['MindIE-image', ], - } + }, } pkg_run_paths = ( -- Gitee From 1d09e81fed01d3b7cebd8b25459c3dd1f4878632 Mon Sep 17 00:00:00 2001 From: WuQiang-Roy Date: Wed, 4 Jun 2025 10:35:43 +0800 Subject: [PATCH 6/7] clean code --- .../downloader/other_downloader.py | 2 +- .../software/KO-Files_25.0.RC1.json | 13 -- .../software/Ko-Files_24.1.RC3.json | 31 ---- ...0.RC1.json => NPU-Container_25.0.RC1.json} | 34 +++-- .../library/install_npu_by_container.py | 144 ++++++++++++------ .../module_utils/compatibility_config.py | 9 +- .../install/install_npu_by_container.yml | 3 +- .../playbooks/process/process_install.yml | 2 +- ascend_deployer/scripts/pkg_utils.py | 4 +- .../tools/npu_container/910A2/Dockerfile | 4 + .../tools/npu_container/910A2/install_ko.sh | 24 +++ .../tools/npu_container/910A2/installation.py | 100 +++++++----- 12 files changed, 218 insertions(+), 152 deletions(-) delete mode 100644 ascend_deployer/downloader/software/KO-Files_25.0.RC1.json delete mode 100644 ascend_deployer/downloader/software/Ko-Files_24.1.RC3.json rename ascend_deployer/downloader/software/{Ko-Files_25.0.RC1.json => NPU-Container_25.0.RC1.json} (64%) create mode 100644 ascend_deployer/tools/npu_container/910A2/install_ko.sh diff --git a/ascend_deployer/downloader/other_downloader.py b/ascend_deployer/downloader/other_downloader.py index 50160766..14b58cc7 100644 --- a/ascend_deployer/downloader/other_downloader.py +++ b/ascend_deployer/downloader/other_downloader.py @@ -84,7 +84,7 @@ class OtherDownloader: other_pkgs = [pkg for pkg in other_pkgs if "tfplugin" not in pkg.filename] download_dir = os.path.join(self._base_dir, "resources", "{0}_{1}".format(soft_ver.name, soft_ver.version)) self._mk_download_dir(other_pkgs, download_dir, soft_ver) - if soft_ver.name in ("CANN", "NPU", "FaultDiag", "MindIE-image", "KO-Files"): + if soft_ver.name in ("CANN", "NPU", "FaultDiag", "MindIE-image", "NPU-Container"): results = self._collect_pkgs_by_arch(arch, download_dir, self._base_dir, other_pkgs) else: results = [] diff --git a/ascend_deployer/downloader/software/KO-Files_25.0.RC1.json b/ascend_deployer/downloader/software/KO-Files_25.0.RC1.json deleted file mode 100644 index 2c68f109..00000000 --- a/ascend_deployer/downloader/software/KO-Files_25.0.RC1.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "name": "KO-Files", - "version": "25.0.RC1", - "default": true, - "other": [ - { - "filename": "openeuler-with-kmod-22.03-lts-aarch64.tar.gz", - "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/os/openeuler-with-kmod-22.03-lts-aarch64.tar.gz", - "sha256": "935e00ad4c0de79a31c0e0212e94fc6e7a68fb4896b125ec63c027e90689b01b", - "dest": "resources/npu_container" - } - ] -} \ No newline at end of file diff --git a/ascend_deployer/downloader/software/Ko-Files_24.1.RC3.json b/ascend_deployer/downloader/software/Ko-Files_24.1.RC3.json deleted file mode 100644 index 2a809feb..00000000 --- a/ascend_deployer/downloader/software/Ko-Files_24.1.RC3.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "name": "Ko-Files", - "version": "24.1.RC1", - "default": false, - "other": [ - { - "filename": "openeuler-with-kmod-22.03-lts-aarch64.tar.gz", - "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/os/openeuler-with-kmod-22.03-lts-aarch64.tar.gz", - "sha256": "935e00ad4c0de79a31c0e0212e94fc6e7a68fb4896b125ec63c027e90689b01b", - "dest": "resources/npu_container" - }, - { - "filename": "openeuler-with-kmod-22.03-lts-x86_64.tar.gz", - "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/os/openeuler-with-kmod-22.03-lts-x86_64.tar.gz", - "sha256": "0055a5e721460bbe8757cdf4ee0fa256eb43daf24e467c52a4e2aa4fe56206b0", - "dest": "resources/npu_container" - }, - { - "filename": "OpenEuler_22.03LTS_aarch64_ko_files_24.1.RC1.zip", - "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.0/A300-3000-npu_24.1.0_linux-aarch64.zip", - "sha256": "4dc7f0181ea069539d6a52d2f613e4478dcab084f1300f07ec54d8f79715f2ab", - "dest": "resources/npu_container" - }, - { - "filename": "OpenEuler_22.03LTS_x86_64_ko_files_24.1.RC1.zip", - "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.0/A300-3000-npu_24.1.0_linux-aarch64.zip", - "sha256": "4dc7f0181ea069539d6a52d2f613e4478dcab084f1300f07ec54d8f79715f2ab", - "dest": "resources/npu_container" - } - ] - } \ No newline at end of file diff --git a/ascend_deployer/downloader/software/Ko-Files_25.0.RC1.json b/ascend_deployer/downloader/software/NPU-Container_25.0.RC1.json similarity index 64% rename from ascend_deployer/downloader/software/Ko-Files_25.0.RC1.json rename to ascend_deployer/downloader/software/NPU-Container_25.0.RC1.json index 368ac070..e4156cd3 100644 --- a/ascend_deployer/downloader/software/Ko-Files_25.0.RC1.json +++ b/ascend_deployer/downloader/software/NPU-Container_25.0.RC1.json @@ -1,8 +1,26 @@ { - "name": "Ko-Files", + "name": "NPU-Container", "version": "25.0.RC1", "default": true, - "other": [ + "required_soft": [ + { + "name": "NPU", + "version": "25.0.RC1" + } + ], + "other": [ + { + "filename": "910A2_aarch64.tar.gz", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/npu_container/910A2.tar.gz", + "sha256": "07d77fada8971f1a3d7befdebfd93fa05d5c7233f395ad048fbcdc055fbba3d2", + "dest": "resources/npu_container" + }, + { + "filename": "910A2_x86_64.tar.gz", + "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/npu_container/910A2.tar.gz", + "sha256": "07d77fada8971f1a3d7befdebfd93fa05d5c7233f395ad048fbcdc055fbba3d2", + "dest": "resources/npu_container" + }, { "filename": "openeuler-with-kmod-22.03-lts-aarch64.tar.gz", "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/os/openeuler-with-kmod-22.03-lts-aarch64.tar.gz", @@ -14,18 +32,6 @@ "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/images/os/openeuler-with-kmod-22.03-lts-x86_64.tar.gz", "sha256": "0055a5e721460bbe8757cdf4ee0fa256eb43daf24e467c52a4e2aa4fe56206b0", "dest": "resources/npu_container" - }, - { - "filename": "OpenEuler_22.03LTS_aarch64_ko_files_25.0.RC1.zip", - "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.0/A300-3000-npu_24.1.0_linux-aarch64.zip", - "sha256": "4dc7f0181ea069539d6a52d2f613e4478dcab084f1300f07ec54d8f79715f2ab", - "dest": "resources/npu_container" - }, - { - "filename": "OpenEuler_22.03LTS_x86_64_ko_files_25.0.RC1.zip", - "url": "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.0/A300-3000-npu_24.1.0_linux-aarch64.zip", - "sha256": "4dc7f0181ea069539d6a52d2f613e4478dcab084f1300f07ec54d8f79715f2ab", - "dest": "resources/npu_container" } ] } \ No newline at end of file diff --git a/ascend_deployer/library/install_npu_by_container.py b/ascend_deployer/library/install_npu_by_container.py index 5c0ac17a..fb47f81f 100644 --- a/ascend_deployer/library/install_npu_by_container.py +++ b/ascend_deployer/library/install_npu_by_container.py @@ -21,9 +21,9 @@ import glob import os from pathlib import Path import platform +import re import time -from utils import ROOT_PATH from ansible.module_utils.basic import AnsibleModule from ansible.module_utils.check_utils import CheckUtil from ansible.module_utils import common_info @@ -38,18 +38,21 @@ class Base: def __init__(self): self.module = AnsibleModule( argument_spec=dict( - resource_dir=dict(type="str", required=True), + resources_dir=dict(type="str", required=True), step=dict(type="str", required=True), image_name=dict(type="str", required=False), ) ) self.facts = dict() + self.resources_dir = os.path.expanduser(self.module.params["resources_dir"]) self.step = self.module.params["step"] self.image_name = self.module.params.get("image_name") + self.os_and_arch = get_os_and_arch() self.npu_name = self._get_npu_name() - self.npu_container = os.path.join(ROOT_PATH, "resources/npu_container") - self.docker_dir = os.path.join(ROOT_PATH, "tools/npu_container/{}".format(self.npu_name)) - self.needed_commands = ["docker", "unzip", "tar"] + self.npu_container = os.path.join(self.resources_dir, "npu_container") + self.docker_dir = os.path.join(self.npu_container, self.npu_name) + self.needed_commands = ["docker", "unzip", "tar", "modprobe"] + self._extract_tar() self._validator() @staticmethod @@ -62,6 +65,21 @@ class Base: if npu.endswith("910"): return NPUCardName.A910A1 return npu + + def _extract_tar(self): + pattern = "{}_ko_files*.tar.gz".format(self.os_and_arch) + matched_files = glob.glob(os.path.join(self.npu_container, pattern)) + if not matched_files: + self.module.fail_json( + changed=False, + rc=1, + msg="[ASCEND][[ERROR]]no ko_files tar.gz pkg found, pattern: {}".format(pattern) + ) + # format: 910A2_aarch64.tar.gz + pkg = [os.path.join(self.npu_container, "{}_{}.tar.gz".format(self.npu_name, ARCH))] + matched_files + for p in pkg: + command = "tar -xf {} -C {}".format(p, self.npu_container) + self.run(command) def _validator(self): if not Path(self.docker_dir).exists(): @@ -79,7 +97,7 @@ class Base: ) def run(self, cmd): - return_code, out, err = self.module.run_command(cmd) + return_code, out, err = self.module.run_command(cmd, use_unsafe_shell=True) output = out + err if return_code != 0: self.module.fail_json( @@ -87,7 +105,7 @@ class Base: rc=1, msg="[ASCEND][[ERROR]]run command: {} failed, output: {}".format(cmd, output) ) - return output + return output class ProcessNPUDriver(Base): @@ -98,8 +116,7 @@ class ProcessNPUDriver(Base): def __init__(self): super(ProcessNPUDriver, self).__init__() self.npu_info = common_info.get_npu_info() - self.resource_dir = os.path.expanduser(self.module.params["resource_dir"]) - self.driver_parent_dir = os.path.join(ROOT_PATH, "/resources/npu") + self.driver_parent_dir = os.path.join(self.resources_dir, "npu") self.driver_file_path = None self.messages = [] @@ -117,7 +134,7 @@ class ProcessNPUDriver(Base): arch = "x86?64" # old package mix x86-64 and x86_64 uniform_npu_scene = npu_scene = self.npu_info.get("scene") # uniform package has higher priority - uni_package_path = common_info.get_scene_dict(os.path.expanduser(self.resource_dir)).get(uniform_npu_scene) + uni_package_path = common_info.get_scene_dict(self.resources_dir).get(uniform_npu_scene) if uni_package_path: driver_file_path = self._find_files(uni_package_path, r"*npu-driver*linux*%s*.run" % arch) self.driver_file_path = driver_file_path or self.driver_file_path @@ -129,7 +146,7 @@ class ProcessNPUDriver(Base): self.module.fail_json( changed=False, rc=1, - msg="[ASCEND][[ERROR]]Do not find npu driver run file in {}.".format(self.resource_dir) + msg="[ASCEND][[ERROR]]Do not find npu driver run file in {}.".format(self.resources_dir) ) driver_dir = os.path.join(self.driver_parent_dir, "npu_driver") command = "bash {} --noexec --extract={}".format( @@ -140,20 +157,27 @@ class ProcessNPUDriver(Base): class ImageController(Base): - openeuler_with_kmod = "openeuler-with-kmod:22.03-lts" + base_image_name = "openeuler-with-kmod" + base_image_version = "22.03-lts" + timeout = 60 def __init__(self): super(ImageController, self).__init__() - self.version = datetime.now().strftime("%Y%m%d%H%M") - - def _install_base_image(self, base_image_name): - command = "docker load -i {}/{}".format(self.npu_container, base_image_name) + self.image_version = datetime.now().strftime("%Y%m%d") + self.container_version = datetime.now().strftime("%Y%m%d%H%M") + self.ko_folder = "/lib/modules/{}/npu_driver".format(os.uname().release) + self.install_ko_script = "install_ko.sh" + self.ascend_tool_path = "/usr/local/Ascend/driver/tools" + + def _load_base_image(self): + base_image_pkg_name = "{}-{}-{}.tar.gz".format(self.base_image_name, self.base_image_version, ARCH) + command = "docker load -i {}/{}".format(self.npu_container, base_image_pkg_name) self.run(command) def build_image(self): - image_name = "npu_driver_image:v{}".format(self.version) - self._install_base_image(self.openeuler_with_kmod) - command = "docker build -t {} .".format(image_name) + image_name = "npu_driver_image:v{}".format(self.image_version) + self._load_base_image() + command = "docker build -f {}/Dockerfile -t {} {}".format(self.docker_dir, image_name, self.docker_dir) self.run(command) command = "docker save {} -o {}/{}".format(image_name, self.npu_container, image_name) self.run(command) @@ -170,21 +194,47 @@ class ImageController(Base): command = "docker load -i {}".format(image_file) self.run(command) - def run_container(self, image_name): - container_name = "npu_driver_container_{}".format(self.version) + def run_container(self): + container_name = "npu_driver_container_{}".format(self.container_version) command = "docker run " \ "--privileged " \ - "--it -d " \ - "-v /lib:/lib " \ + "-it -d " \ + "-v /lib:/mnt/lib " \ "-v /usr/local:/usr/local " \ + "-v /root/.bashrc:/host_bashrc " \ "-v /etc:/etc " \ - "--name {} {}".format(container_name, image_name) + "--name {} {}".format(container_name, self.image_name) self.run(command) - return container_name + # query log + start = 0 + command = "docker logs {}".format(container_name) + while start <= self.timeout: + out = self.run(command) + if re.search(r"Finished", out): + return container_name + start += 1 + time.sleep(1) + self.module.fail_json( + msg="container started failed, please check the container log: 'docker logs {}'".format(container_name), + rc=1, + changed=True, + ) - def destroy_container(self, container_name): - command = "docker rm -f {}".format(container_name) + def install_ko(self): + script = os.path.join(self.ko_folder, self.install_ko_script) + self.run("bash {}".format(script)) + # make binary to executable + command = "chmod +x {}/*".format(self.ascend_tool_path) self.run(command) + + def destroy(self, container_name): + commands = [ + "docker rm -f {}".format(container_name), + "docker image rm -f {}".format(self.image_name), + "docker image rm -f {}:{}".format(self.base_image_name, self.base_image_version), + ] + for command in commands: + self.run(command) class InstallNPUDriverByContainer(Base): @@ -194,32 +244,28 @@ class InstallNPUDriverByContainer(Base): def __init__(self): super(InstallNPUDriverByContainer, self).__init__() - self.os_and_arch = get_os_and_arch() self.image_controller = ImageController() self.npu_driver = ProcessNPUDriver() self.kernel_version = platform.uname().release self.env_sh = Path("/etc/profile.d/ascend.sh") self._validator() - # def _unzip_ko_files(self): - # ko_files_zip = os.path.join(self.ko_files_dir, "{}_ko_files.zip".format(self.os_and_arch)) - # command = "unzip -q {}".format(ko_files_zip) - # self.run(command) - def _find_ko_files(self): """ - - /resources/ko_files/{}_ko_files + - /resources/npu_container/{os_and_arch}_ko_files*/{}_ko_files - 4.19.90 - 5.10.0 """ - folder = "{}_ko_files".format(self.os_and_arch) - unzipped_ko_files_dir = os.path.join(self.npu_container, folder) - if not Path(unzipped_ko_files_dir).exists(): + folder_pattern = "{}_ko_files*".format(self.os_and_arch) + matched_folder = glob.glob(os.path.join(self.npu_container, folder_pattern)) + matched_folder = [i for i in matched_folder if os.path.isdir(i)] + if not matched_folder: self.module.fail_json( changed=False, rc=1, - msg="[ASCEND][[ERROR]]folder({}} does not exist.".format(folder) + msg="[ASCEND][[ERROR]]folder({}} does not exist.".format(folder_pattern) ) + unzipped_ko_files_dir = matched_folder[0] # process kernel # kernel example: 4.19.90-vhulk2111.1.0.h963.eulerosv2r10.aarch64, 5.10.0-60.18.0.50.oe2203.aarch64 # process it as: 4.19.90, 5.10.0 @@ -243,11 +289,11 @@ class InstallNPUDriverByContainer(Base): def _copy_resources(self): driver_dir = self.npu_driver.extract_npu_driver() - # self._unzip_ko_files() - origin_ko_files_dir = self._find_ko_files() + origin_ko_files_dir = self._find_ko_files() commands = [ "cp -r {}/driver {} ".format(driver_dir, self.docker_dir), - "mkdir -p {}/ko_files && cp {}/* {}/ko_files".format(self.docker_dir, origin_ko_files_dir, self.docker_dir) + "mkdir -p {}/ko_files".format(self.docker_dir), + "cp {}/*.ko {}/ko_files".format(origin_ko_files_dir, self.docker_dir) ] for cmd in commands: self.run(cmd) @@ -257,7 +303,7 @@ class InstallNPUDriverByContainer(Base): max_retry = 10 for i in range(max_retry): if not Path(self.env_sh).exists(): - time.sleep((i+1) ** 2) + time.sleep((i + 1) ** 2) continue self.run(command) if not self.module.get_bin_path("npu-smi"): @@ -272,20 +318,26 @@ class InstallNPUDriverByContainer(Base): """ Main method to install the NPU driver by running a Docker container. """ - result = {} if self.step == self.step_build: - self._copy_resources + self._copy_resources() image_name = self.image_controller.build_image() self.facts["image_name"] = image_name - self.module.exist_json( + self.module.exit_json( + rc=0, changed=True, msg="build {} successfully.".format(image_name), ansible_facts=self.facts, ) elif self.step == self.step_install: container_name = self.image_controller.run_container() + self.image_controller.install_ko() self._query_result(container_name) - self.image_controller.destroy_container(container_name) + self.image_controller.destroy(container_name) + self.module.exit_json( + rc=0, + changed=True, + msg="install npu driver by container successfully." + ) else: self.module.fail_json( changed=False, diff --git a/ascend_deployer/module_utils/compatibility_config.py b/ascend_deployer/module_utils/compatibility_config.py index 3e134d7d..579f854c 100644 --- a/ascend_deployer/module_utils/compatibility_config.py +++ b/ascend_deployer/module_utils/compatibility_config.py @@ -63,6 +63,7 @@ class Tags: DRIVER = "driver" FIRMWARE = "firmware" NPU = "npu" + NPU_BY_CONTAINER = "npu_by_container" MCU = "mcu" TFPLUGIN = "tfplugin" NNAE = "nnae" @@ -105,7 +106,8 @@ class Tags: DRIVER, FIRMWARE, NPU, - MCU + MCU, + NPU_BY_CONTAINER } # The tfplugin component was removed after version 8.0.0. @@ -187,8 +189,9 @@ class HardwareOSTags: A300T_A2_SUPPORT_TAGS = (Tags.BASIC_TAGS | Tags.MINDCLUSTER_TAGS | Tags.AI_FRAMEWORKS_TAGS) # 800i-a2 support mindie_image - ATLAS_800I_A2_SUPPORT_TAGS = (Tags.BASIC_TAGS | Tags.MINDCLUSTER_TAGS | {Tags.MINDIE_IMAGE, Tags.OFFLINE_DEV, - Tags.OFFLINE_RUN}) + ATLAS_800I_A2_SUPPORT_TAGS = ( + Tags.BASIC_TAGS | Tags.MINDCLUSTER_TAGS | + {Tags.MINDIE_IMAGE, Tags.OFFLINE_DEV, Tags.OFFLINE_RUN}) ATLAS_800I_A3_SUPPORT_TAGS = (Tags.BASIC_TAGS | Tags.MINDCLUSTER_TAGS) - {Tags.DOCKER_IMAGES} diff --git a/ascend_deployer/playbooks/install/install_npu_by_container.yml b/ascend_deployer/playbooks/install/install_npu_by_container.yml index e32c9837..4ab90c15 100644 --- a/ascend_deployer/playbooks/install/install_npu_by_container.yml +++ b/ascend_deployer/playbooks/install/install_npu_by_container.yml @@ -1,5 +1,5 @@ - hosts: - - master[0] + - worker[0] - other_build_image name: build image for npu driver tasks: @@ -21,5 +21,6 @@ - name: install npu driver by container install_npu_by_container: + resources_dir: "{{ resource_path }}" step: 'install' image_name: "{{ image_name }}" \ No newline at end of file diff --git a/ascend_deployer/playbooks/process/process_install.yml b/ascend_deployer/playbooks/process/process_install.yml index e2328051..a1ef0944 100644 --- a/ascend_deployer/playbooks/process/process_install.yml +++ b/ascend_deployer/playbooks/process/process_install.yml @@ -19,7 +19,7 @@ tags: driver,firmware,npu - name: install npu driver by container - import_playbool: ../install/install_npu_by_container + import_playbook: ../install/install_npu_by_container.yml tags: npu_by_container - name: install atlasedge diff --git a/ascend_deployer/scripts/pkg_utils.py b/ascend_deployer/scripts/pkg_utils.py index e527a4e8..c5876c8e 100644 --- a/ascend_deployer/scripts/pkg_utils.py +++ b/ascend_deployer/scripts/pkg_utils.py @@ -217,8 +217,8 @@ tags_map = { }, 'npu_by_container': { 'need_nexus': False, - 'name_keywords': ['ko_files'], - 'path_keywords': ['npu_container'], + 'name_keywords': ['*', "npu"], + 'path_keywords': ['npu_container', 'run_from_*_zip'], }, } diff --git a/ascend_deployer/tools/npu_container/910A2/Dockerfile b/ascend_deployer/tools/npu_container/910A2/Dockerfile index a2673a3a..32bc3925 100644 --- a/ascend_deployer/tools/npu_container/910A2/Dockerfile +++ b/ascend_deployer/tools/npu_container/910A2/Dockerfile @@ -6,7 +6,11 @@ COPY davinci.conf . COPY dms_events_conf.lst . COPY ./driver /app/driver COPY ./ko_files /app/ko_files +COPY install_ko.sh . COPY installation.py . COPY entrypoint.sh . +RUN chmod +x /app/entrypoint.sh +RUN chmod +x /app/install_ko.sh + CMD ["/app/entrypoint.sh"] \ No newline at end of file diff --git a/ascend_deployer/tools/npu_container/910A2/install_ko.sh b/ascend_deployer/tools/npu_container/910A2/install_ko.sh new file mode 100644 index 00000000..799533a3 --- /dev/null +++ b/ascend_deployer/tools/npu_container/910A2/install_ko.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +echo "start depmod..." +sudo depmod +echo "finish depmod." +kernel_version=$(uname -r) +dir="/lib/modules/$kernel_version/npu_driver" +if [ -d "$dir" ]; then + echo "Files in $dir:" + for file in "$dir"/*.ko; do + if [ -f "$file" ]; then + module_name=$(basename "$file" .ko) + echo "Loading module: $module_name" + sudo modprobe "$module_name" + if [ $? -eq 0 ]; then + echo "Module $module_name loaded successfully." + else + echo "Failed to load module $module_name." + fi + fi + done +else + echo "Directory $dir does not exist." +fi \ No newline at end of file diff --git a/ascend_deployer/tools/npu_container/910A2/installation.py b/ascend_deployer/tools/npu_container/910A2/installation.py index 250d2551..b9ed751b 100644 --- a/ascend_deployer/tools/npu_container/910A2/installation.py +++ b/ascend_deployer/tools/npu_container/910A2/installation.py @@ -20,8 +20,16 @@ import glob import os from pathlib import Path import re +import shutil import subprocess from typing import List +import logging + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S" +) class Commands: @@ -29,13 +37,13 @@ class Commands: A class to handle shell commands execution. """ @staticmethod - def run(command: List[str]) -> None: + def run(command: str) -> None: """ Execute a shell command. :param command: The command to execute. :return: The output of the command. """ - result = subprocess.run(command, shell=True, check=True) + result = subprocess.run(command, check=True, shell=True) if result.returncode != 0: raise RuntimeError( f"Command '{command}' failed with return code {result.returncode}. " @@ -50,50 +58,58 @@ class Installation: def __init__(self): self.kernel_version = os.uname().release - self.ko_folder = f"/lib/modules/{self.kernel_version}/npu_driver" - + self.ko_folder = f"/mnt/lib/modules/{self.kernel_version}/npu_driver" def _setup(self): """ - Setup the installation environment. + Setup the installation environment """ - commands = [ - ["mkdir", "/usr/local/Ascend"], - ["mkdir", "-p", "/user/local/Ascend/driver/lib64/common"], - ["mkdir", "-p", self.ko_folder], + paths = [ + "/usr/local/Ascend", + "/user/local/Ascend/driver/lib64/common", + self.ko_folder ] - for command in commands: - Commands().run(command) - print("Setup completed successfully.") + + for path in paths: + if os.path.exists(path): + logging.info(f"Removing existing directory: {path}") + shutil.rmtree(path) + os.makedirs(path, exist_ok=True) + logging.info(f"Created directory: {path}") + + logging.info("Setup completed successfully.") def _copy_resources(self): """ Copy necessary resources for installation. """ commands = [ - ["cp", "-r", "/app/driver /usr/local/Ascend/"], - ["cp", "-r", "/app/davinci.conf /lib/"], - ["cp", "-r", "/app/dms_events.conf", "/etc/"], - ["cp", "/usr/local/Ascend/driver/lib64/*.so", "/usr/local/Ascend/driver/lib64/common"], - ["cp", "/app/ko_files/*.ko", self.ko_folder], + "cp -r /app/driver /usr/local/Ascend/", + "cp -r /app/davinci.conf /mnt/lib/", + "cp -r /app/dms_events_conf.lst /etc/", + f"cp -r /app/install_ko.sh {self.ko_folder}", + "cp /usr/local/Ascend/driver/lib64/*.so /usr/local/Ascend/driver/lib64/common", + f"cp /app/ko_files/*.ko {self.ko_folder}", ] for command in commands: Commands.run(command) - print("Resources copied successfully.") + logging.info("Resources copied successfully.") - def _update_permissions(self): + @staticmethod + def _update_permissions(): """ Update permissions for the copied resources. """ commands = [ - ["chmod", "777", "/lib/davinci.conf"], - ["chmod", "777", "/etc/dms_events.conf"] + "chmod 777 /mnt/lib/davinci.conf", + "chmod 777 /etc/dms_events_conf.lst" ] for command in commands: Commands.run(command) - print("Permission updated successfully.") + logging.info("Permission updated successfully.") - def _extract_array(self, content: str, key: str) -> List[str]: + @staticmethod + def _extract_array(content: str, key: str) -> List[str]: """ Extract an array of strings from a given content based on a key. :param content: The content to search in. @@ -127,21 +143,11 @@ class Installation: raise FileNotFoundError( f"Source file '{src_file}' does not exist.") - Commands.run(["mv", src_file, dst_file]) - print(f"Renaming {src_file} to {dst_file} successfully.") + Commands.run(f"mv {src_file} {dst_file}") + logging.info(f"Renaming {src_file} to {dst_file} successfully.") - def _install_ko(self): - """ - Install kernel objects (ko files). - """ - for ko_file in Path(self.ko_folder).glob("*.ko"): - if ko_file.is_file(): - module_name = ko_file.stem - print(f"Installing kernel object: {ko_file.name}") - Commands.run(["modprobe", module_name]) - print(f"Installed kernel object: {ko_file.name}") - - def _configure_env(self): + @staticmethod + def _configure_env(): """ Configure the environment for the installation. """ @@ -149,15 +155,28 @@ class Installation: env_content = ( "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common:" "/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH\n" + "export PATH=$PATH:/usr/local/Ascend/driver/tools/" ) try: with env_path.open("w", encoding="utf-8") as env_file: env_file.write(env_content) - Commands.run(["bash", str(env_path)]) - print("Environment configured successfully.") + logging.info("Environment configured successfully.") except Exception as e: raise RuntimeError(f"Failed to configure environment: {e}") from e + + # add env command to bashrc + # mount: /root/.bashrc:/host_bashrc + bashrc_path = Path("/host_bashrc") + env_command = f"bash {env_path}\n" + try: + if bashrc_path.exists(): + content = bashrc_path.read_text(encoding="utf-8") + if env_command.strip() not in content: + with bashrc_path.open("a", encoding="utf-8") as bashrc_file: + bashrc_file.write(env_command) + except Exception as e: + raise RuntimeError(f"Failed to add env command to bashrc: {e}") from e def install(self): """ @@ -168,10 +187,11 @@ class Installation: self._copy_resources() self._update_permissions() self._update_specific_func() - self._install_ko() self._configure_env() if __name__ == "__main__": installer = Installation() installer.install() + logging.info("Finished setting pre-env") + -- Gitee From 9e1463ba3cb9d8cdc8ef554246ab232f57c9376a Mon Sep 17 00:00:00 2001 From: WuQiang-Roy Date: Thu, 5 Jun 2025 16:42:27 +0800 Subject: [PATCH 7/7] update --- .../tools/npu_container/910A2/install_ko.sh | 4 ++-- .../tools/npu_container/910A2/installation.py | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/ascend_deployer/tools/npu_container/910A2/install_ko.sh b/ascend_deployer/tools/npu_container/910A2/install_ko.sh index 799533a3..fcf8a7db 100644 --- a/ascend_deployer/tools/npu_container/910A2/install_ko.sh +++ b/ascend_deployer/tools/npu_container/910A2/install_ko.sh @@ -1,7 +1,7 @@ #!/bin/bash echo "start depmod..." -sudo depmod +depmod echo "finish depmod." kernel_version=$(uname -r) dir="/lib/modules/$kernel_version/npu_driver" @@ -11,7 +11,7 @@ if [ -d "$dir" ]; then if [ -f "$file" ]; then module_name=$(basename "$file" .ko) echo "Loading module: $module_name" - sudo modprobe "$module_name" + modprobe "$module_name" if [ $? -eq 0 ]; then echo "Module $module_name loaded successfully." else diff --git a/ascend_deployer/tools/npu_container/910A2/installation.py b/ascend_deployer/tools/npu_container/910A2/installation.py index b9ed751b..101d129e 100644 --- a/ascend_deployer/tools/npu_container/910A2/installation.py +++ b/ascend_deployer/tools/npu_container/910A2/installation.py @@ -59,6 +59,7 @@ class Installation: def __init__(self): self.kernel_version = os.uname().release self.ko_folder = f"/mnt/lib/modules/{self.kernel_version}/npu_driver" + self.ascend_tool_path = "/usr/local/Ascend/driver/tools" def _setup(self): """ @@ -87,7 +88,6 @@ class Installation: "cp -r /app/driver /usr/local/Ascend/", "cp -r /app/davinci.conf /mnt/lib/", "cp -r /app/dms_events_conf.lst /etc/", - f"cp -r /app/install_ko.sh {self.ko_folder}", "cp /usr/local/Ascend/driver/lib64/*.so /usr/local/Ascend/driver/lib64/common", f"cp /app/ko_files/*.ko {self.ko_folder}", ] @@ -145,9 +145,19 @@ class Installation: Commands.run(f"mv {src_file} {dst_file}") logging.info(f"Renaming {src_file} to {dst_file} successfully.") - + @staticmethod - def _configure_env(): + def _install_ko(): + command = "bash /app/install_ko.sh" + Commands.run(command) + logging.info(f"Installed ko files successfully.") + + def _make_file_executable(self): + command = f"chmod +x {self.ascend_tool_path}/*" + Commands.run(command) + logging.info(f"Make all the file in {self.ascend_tool_path} to executable successfully.") + + def _configure_env(self): """ Configure the environment for the installation. """ @@ -155,7 +165,7 @@ class Installation: env_content = ( "export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common:" "/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH\n" - "export PATH=$PATH:/usr/local/Ascend/driver/tools/" + f"export PATH=$PATH:{self.ascend_tool_path}/" ) try: -- Gitee