From c513fd34529b6bcbf51a4b6432a3ed0f4c29a9ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 13 Feb 2025 20:11:23 +0800 Subject: [PATCH 01/21] adaround --- .../ada_round_calibration/README_CN.md | 72 ++++ .../ada_round_calibration/data/.keep | 0 .../ada_round_calibration/model/.keep | 0 .../ada_round_calibration/src/__init__.py | 0 .../src/config/quant.cfg | 5 + .../src/resnet-18_calibration.py | 172 +++++++++ .../ada_round_calibration/src/resnet.py | 357 ++++++++++++++++++ 7 files changed, 606 insertions(+) create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md new file mode 100644 index 000000000..5c8c45ad8 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md @@ -0,0 +1,72 @@ +# ResNet-101 + +## 1. HIF8/FP8校准 + +### 1.1 量化前提 + ++ **模型准备** +请下载 [ResNet-18](https://download.pytorch.org/models/resnet18-5c106cde.pth) 模型文件并保存到`model`目录。 + + ++ **数据集准备** +使用昇腾模型压缩工具对模型完成量化后,需要对模型进行推理,以测试量化数据的精度。推理过程中需要使用与模型相匹配的数据集。请下载[测试图片](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/003_Atc_Models/AE/ATC%20Model/resnet-101_nuq/images.zip),解压后将`images`文件夹放到`data`目录。 + ++ **校准集准备** +校准集用来产生量化因子,保证精度。本 sample 校准集与数据集相同。 + +### 1.2 简易量化配置 +./config/ada_round.cfg文件为用户自定义的简易量化配置,具体表示信息如下: + + +| 字段 |类型| 说明 | 默认值 | 取值范围 | +|:--| :-: | :-- | :-: | :-: | +|common_config.ada_quantize.num_iteration|uint|adaround训练迭代次数|10000|>0| +|common_config.ada_quantize.warm_start|float|预热因子|0.2|(0,1)| +|common_config.ada_quantize.reg_param|float|正则化参数|0.01|(0,1)| +|common_config.ada_quantize.channel_wise|bool|是否开启pre channel|true|false/true| +|common_config.ada_quantize.beta_range_start|uint|beta衰减范围|20|/| +|common_config.ada_quantize.beta_range_end|uint|beta衰减范围|2|/| + +更多参数配置请参考[资料](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/devaids/devtools/amct/atlasamct_16_0131.html) + + +### 1.3 量化示例 + +执行量化示例前,请先检查当前目录下是否包含以下文件及目录,其中 images 文件夹内部包含有 160 张用于校准和测试的图片: + ++ [data](./data/) + + images ++ [model](./model/) + + resnet18-5c106cde.pth ++ [src](./src/) + + [config/ada_round.cfg](./src/config/ada_round.cfg) + + [\_\_init__.py](./src/__init__.py) + + [resnet-18_calibration.py](./src/resnet-18_calibration.py) + + [resnet.py](./src/resnet.py) + +请在当前目录执行如下命令运行示例程序: + +```bash +CUDA_VISIBLE_DEVICES=0 python ./src/resnet-18_calibration.py +``` + +> 其中 `CUDA_VISIBLE_DEVICES` 是必填参数,表示使用 CPU 还是 GPU 进行量化,参数取值为: +> +> + -1:使用 CPU 进行量化。 +> + 其他 Device ID使用 GPU 进行量化,具体 ID 请以用户实际环境为准。 +> + + +### 1.4 量化结果 + +量化成功后,在当前目录会生成量化日志文件 ./amct_log/amct_pytorch.log 和 ./outputs/calibration 文件夹,该文件夹内包含以下内容: + + ++ tmp: 临时文件夹 + + config.json: 量化配置文件,描述了如何对模型中的每一层进行量化。 + + record.txt: 量化因子记录文件记录量化因子。 + + 
modified_model.onnx: 临时模型文件,即原始的 PyTorch 模型 BN 融合后导出的 ONNX 模型文件。 ++ resnet-18_deploy_model.onnx: 量化部署模型,即量化后的可在昇腾 AI 处理器部署的模型文件。 ++ resnet-18_fake_quant_model.onnx: 量化仿真模型,即量化后的可在 ONNX 执行框架 ONNX Runtime 进行精度仿真的模型 + +> 如果量化脚本所在目录下已经存在量化配置文件,则再次调用 `create_quant_config` 接口时,如果新生成的量化配置文件与已有的文件同名,则会覆盖已有的量化配置文件,否则生成新的量化配置文件。 diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg new file mode 100644 index 000000000..bdbebea42 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg @@ -0,0 +1,5 @@ +common_config : { + ada_quantize : { + num_iteration : 2000 + } +} diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py new file mode 100644 index 000000000..d41a31b40 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py @@ -0,0 +1,172 @@ +""" +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + + +import os +import torch # pylint: disable=E0401 +from PIL import Image # pylint: disable=E0401 +from torchvision import transforms # pylint: disable=E0401 +import onnxruntime as ort + +import amct_pytorch as amct # pylint: disable=E0401 +from resnet import resnet18 # pylint: disable=E0401, C0415 + + +PATH = os.path.realpath('./') +IMG_DIR = os.path.join(PATH, 'data/images') +LABEL_FILE = os.path.join(IMG_DIR, 'image_label.txt') + +OUTPUTS = os.path.join(PATH, 'outputs/calibration') + +TMP = os.path.join(OUTPUTS, 'tmp') + + +def get_labels_from_txt(label_file): + """Read all images' name and label from label_file""" + images = [] + labels = [] + with open(label_file, 'r') as f: + lines = f.readlines() + for line in lines: + images.append(line.split(' ')[0]) + labels.append(int(line.split(' ')[1])) + return images, labels + + +def prepare_image_input(images): + """Read all images""" + input_tensor = torch.zeros(len(images), 3, 224, 224) # pylint: disable=E1101 + preprocess = transforms.Compose( + [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + for index, image in enumerate(images): + input_image = Image.open(image).convert('RGB') + input_tensor[index, ...] = preprocess(input_image) + return input_tensor + + +def img_postprocess(probs, labels): + """Do image post-process""" + # calculate top1 and top5 accuracy + top1_get = 0 + top5_get = 0 + prob_size = probs.shape[1] + for index, label in enumerate(labels): + top5_record = (probs[index, :].argsort())[prob_size - 5: prob_size] + if label == top5_record[-1]: + top1_get += 1 + top5_get += 1 + elif label in top5_record: + top5_get += 1 + return float(top1_get) / len(labels), float(top5_get) / len(labels) + + +def model_forward(model, batch_size, iterations): + """Do pytorch model forward""" + images, labels = get_labels_from_txt(LABEL_FILE) + images = [os.path.join(IMG_DIR, image) for image in images] + top1_total = 0 + top5_total = 0 + for i in range(iterations): + input_batch = prepare_image_input(images[i * batch_size: (i + 1) * batch_size]) + # move the input and model to GPU for speed if available + if torch.cuda.is_available(): + input_batch = input_batch.to('cuda') + model.to('cuda') + + with torch.no_grad(): + output = model(input_batch) + top1, top5 = img_postprocess(output, labels[i * batch_size: (i + 1) * batch_size]) + top1_total += top1 + top5_total += top5 + print('****************iteration:{}*****************'.format(i)) + print('top1_acc:{}'.format(top1)) + print('top5_acc:{}'.format(top5)) + print('******final top1:{}'.format(top1_total / iterations)) + print('******final top5:{}'.format(top5_total / iterations)) + return top1_total / iterations, top5_total / iterations + + +def onnx_forward(onnx_model, batch_size, iterations): + """Do onnx model forward""" + ort_session = ort.InferenceSession(onnx_model) + + images, labels = get_labels_from_txt(LABEL_FILE) + images = [os.path.join(IMG_DIR, image) for image in images] + top1_total = 0 + top5_total = 0 + for i in range(iterations): + input_batch = prepare_image_input(images[i * batch_size: (i + 1) * batch_size]) + output = ort_session.run(None, {'input': input_batch.numpy()}) + top1, top5 = img_postprocess(output[0], labels[i * batch_size: (i + 1) * batch_size]) + top1_total += top1 + top5_total += top5 + print('****************iteration:{}*****************'.format(i)) + print('top1_acc:{}'.format(top1)) + print('top5_acc:{}'.format(top5)) + print('******final 
top1:{}'.format(top1_total / iterations)) + print('******final top5:{}'.format(top5_total / iterations)) + return top1_total / iterations, top5_total / iterations + + +def main(): + """Sample main function""" + model = resnet18(pretrained=True) + + model.eval() + ori_top1, ori_top5 = model_forward(model, batch_size=32, iterations=5) + + # Quantize configurations + images, _ = get_labels_from_txt(LABEL_FILE) + images = [os.path.join(IMG_DIR, image) for image in images] + input_data = prepare_image_input(images[:32]) + if torch.cuda.is_available(): + input_data = (input_data.to('cuda')) + model.to('cuda') + config_json_file = os.path.join(TMP, 'config.json') + skip_layers = [] + batch_num = 2 + + config_defination = os.path.join(PATH, 'src/config/ada_round.cfg') + amct.create_quant_config(config_json_file, model, input_data, skip_layers, batch_num, config_defination=config_defination) + + + # Phase1: do conv+bn fusion, weights calibration and generate + # calibration model + record_file = os.path.join(TMP, 'record.txt') + modified_model = os.path.join(TMP, 'modified_model.onnx') + calibration_model = amct.quantize_model( + config_json_file, modified_model, record_file, model, input_data, input_names=['input'], + output_names=['output'], dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}) + + # Phase2: do calibration + model_forward(calibration_model, batch_size=32, iterations=batch_num) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Phase3: save final model, one for onnx do fake quant test, one + # deploy model for ATC + result_path = os.path.join(OUTPUTS, 'resnet-18') + amct.save_model(modified_model, record_file, result_path) + + # Phase4: run fake_quant model test + quant_top1, quant_top5 = onnx_forward( + '%s_%s' % (result_path, 'fake_quant_model.onnx'), batch_size=32, iterations=5) + print('[INFO] ResNet18 before quantize top1:{:>10} top5:{:>10}'.format(ori_top1, ori_top5)) + print('[INFO] ResNet18 after quantize top1:{:>10} top5:{:>10}'.format(quant_top1, quant_top5)) + +if __name__ == '__main__': + main() diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py new file mode 100644 index 000000000..ea1a4b56d --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py @@ -0,0 +1,357 @@ +import os +import torch +import torch.nn as nn + + +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', + 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', + 'wide_resnet50_2', 'wide_resnet101_2'] + + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', + 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', + 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', + 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', + 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', +} + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, 
out_planes, kernel_size=3, stride=stride, + padding=dilation, groups=groups, bias=False, dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class BasicBlock(nn.Module): + """BasicBlock""" + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + """forward""" + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + """ + Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + while original implementation places the stride at the first 1x1 convolution(self.conv1) + according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. + This variant is also known as ResNet V1.5 and improves accuracy according to + https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
+ """ + + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + """forward""" + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + """ResNet""" + def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, + groups=1, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None): + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, + dilate=replace_stride_with_dilation[2]) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = torch.reshape(x, (x.shape[0], x.shape[1])) + x = self.fc(x) + + return x + + def forward(self, x): + """forward""" + return self._forward_impl(x) + + +def _resnet(arch, block, layers, pretrained, progress, **kwargs): + model = ResNet(block, layers, **kwargs) + if pretrained: + model.load_state_dict(torch.load('./model/resnet101-5d3b4d8f.pth')) + return model + + +def resnet18(pretrained=False, progress=True, **kwargs): + r"""ResNet-18 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, + **kwargs) + + +def resnet34(pretrained=False, progress=True, **kwargs): + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet50(pretrained=False, progress=True, **kwargs): + r"""ResNet-50 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet101(pretrained=False, progress=True, **kwargs): + r"""ResNet-101 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, + **kwargs) + + +def resnet152(pretrained=False, progress=True, **kwargs): + r"""ResNet-152 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a 
progress bar of the download to stderr + """ + return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, + **kwargs) + + +def resnext50_32x4d(pretrained=False, progress=True, **kwargs): + r"""ResNeXt-50 32x4d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 4 + return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], + pretrained, progress, **kwargs) + + +def resnext101_32x8d(pretrained=False, progress=True, **kwargs): + r"""ResNeXt-101 32x8d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 8 + return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], + pretrained, progress, **kwargs) + + +def wide_resnet50_2(pretrained=False, progress=True, **kwargs): + r"""Wide ResNet-50-2 model from + `"Wide Residual Networks" `_ + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], + pretrained, progress, **kwargs) + + +def wide_resnet101_2(pretrained=False, progress=True, **kwargs): + r"""Wide ResNet-101-2 model from + `"Wide Residual Networks" `_ + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], + pretrained, progress, **kwargs) -- Gitee From fea7862b68fd043f713d336de5f30f134a48aa6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Fri, 14 Feb 2025 15:36:35 +0800 Subject: [PATCH 02/21] fix --- .../src/config/{quant.cfg => ada_round.cfg} | 1 + .../9_amct/amct_pytorch/ada_round_calibration/src/resnet.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) rename python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/{quant.cfg => ada_round.cfg} (72%) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg similarity index 72% rename from python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg rename to python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg index bdbebea42..a9d4989d8 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg @@ -1,5 +1,6 @@ common_config : { ada_quantize : { num_iteration : 2000 + channel_wise : false } } diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py index ea1a4b56d..475aaafb9 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py @@ -229,7 +229,7 @@ class ResNet(nn.Module): def _resnet(arch, block, layers, pretrained, progress, **kwargs): model = ResNet(block, layers, **kwargs) if pretrained: - model.load_state_dict(torch.load('./model/resnet101-5d3b4d8f.pth')) + model.load_state_dict(torch.load('./model/resnet18-5c106cde.pth')) return model -- Gitee From 550affe7439012c36c9b0389cbd81677d359b472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Sat, 15 Feb 2025 16:12:38 +0800 Subject: [PATCH 03/21] fix --- .../ada_round_calibration/src/resnet-18_calibration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py index d41a31b40..e5b2ba283 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py @@ -132,7 +132,7 @@ def main(): # Quantize configurations images, _ = get_labels_from_txt(LABEL_FILE) images = [os.path.join(IMG_DIR, image) for image in images] - input_data = prepare_image_input(images[:32]) + input_data = prepare_image_input(images) if torch.cuda.is_available(): input_data = (input_data.to('cuda')) model.to('cuda') -- Gitee From e08579f234ac4b94ae2b976863e08725cae0cf34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Sat, 15 Feb 2025 16:45:37 +0800 Subject: [PATCH 04/21] fix --- 
.../ada_round_calibration/README_CN.md | 6 +++--- .../src/config/ada_round.cfg | 6 +++++- .../ada_round_calibration/src/resnet.py | 16 ++++++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md index 5c8c45ad8..d4db83bbf 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md @@ -1,6 +1,6 @@ -# ResNet-101 +# ResNet-18 -## 1. HIF8/FP8校准 +## 1. ada round量化校准 ### 1.1 量化前提 @@ -27,7 +27,7 @@ |common_config.ada_quantize.beta_range_start|uint|beta衰减范围|20|/| |common_config.ada_quantize.beta_range_end|uint|beta衰减范围|2|/| -更多参数配置请参考[资料](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/devaids/devtools/amct/atlasamct_16_0131.html) +更多参数配置请参考[官方文档](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/devaids/devtools/amct/atlasamct_16_0131.html) ### 1.3 量化示例 diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg index a9d4989d8..768b8deae 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg @@ -1,6 +1,10 @@ common_config : { ada_quantize : { - num_iteration : 2000 + num_iteration : 10000 + warm_start : 0.2 + reg_param : 0.01 + beta_range_start : 20 + beta_range_end : 2 channel_wise : false } } diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py index 475aaafb9..483755af8 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py @@ -1,3 +1,19 @@ +""" +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + import os import torch import torch.nn as nn -- Gitee From 83b716f8de167ba5bb5a78cfabebc9f62c5f51a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Tue, 18 Feb 2025 08:39:41 +0800 Subject: [PATCH 05/21] fix --- .../amct_pytorch/ada_round_calibration/README_CN.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md index d4db83bbf..239cc81b3 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md @@ -56,6 +56,14 @@ CUDA_VISIBLE_DEVICES=0 python ./src/resnet-18_calibration.py > + 其他 Device ID使用 GPU 进行量化,具体 ID 请以用户实际环境为准。 > +若出现如下信息,则说明量化成功: + +```none +INFO - [AMCT]:[Utils]: The model file is saved in ./outputs/calibration/resnet-18_deploy_model.onnx +INFO - [AMCT]:[Utils]: The model file is saved in ./outputs/calibration/resnet-18_fake_quant_model.onnx +[INFO] ResNet18 before quantize top1: 0.84375 top5: 0.9625 +[INFO] ResNet18 after quantize top1: 0.84375 top5: 0.95625 +``` ### 1.4 量化结果 -- Gitee From 9007bbf7c6b962bfcd1ea136f418a17134ab13cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 20 Feb 2025 09:47:45 +0800 Subject: [PATCH 06/21] fix --- .../9_amct/amct_pytorch/ada_round_calibration/README_CN.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md index 239cc81b3..0c5f7b7cb 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md @@ -24,8 +24,8 @@ |common_config.ada_quantize.warm_start|float|预热因子|0.2|(0,1)| |common_config.ada_quantize.reg_param|float|正则化参数|0.01|(0,1)| |common_config.ada_quantize.channel_wise|bool|是否开启pre channel|true|false/true| -|common_config.ada_quantize.beta_range_start|uint|beta衰减范围|20|/| -|common_config.ada_quantize.beta_range_end|uint|beta衰减范围|2|/| +|common_config.ada_quantize.beta_range_start|uint|退火起始参数|20|/| +|common_config.ada_quantize.beta_range_end|uint|退火终止参数|2|/| 更多参数配置请参考[官方文档](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/devaids/devtools/amct/atlasamct_16_0131.html) -- Gitee From ed5b21111b910f91145e41a6f4a74497d157d3fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 20 Feb 2025 10:34:48 +0800 Subject: [PATCH 07/21] fix --- .../9_amct/amct_pytorch/ada_round_calibration/README_CN.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md index 0c5f7b7cb..825cd0471 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md @@ -24,8 +24,8 @@ |common_config.ada_quantize.warm_start|float|预热因子|0.2|(0,1)| |common_config.ada_quantize.reg_param|float|正则化参数|0.01|(0,1)| |common_config.ada_quantize.channel_wise|bool|是否开启pre channel|true|false/true| -|common_config.ada_quantize.beta_range_start|uint|退火起始参数|20|/| 
-|common_config.ada_quantize.beta_range_end|uint|退火终止参数|2|/| +|common_config.ada_quantize.beta_range_start|uint|退火起始参数|20|>beta_range_end| +|common_config.ada_quantize.beta_range_end|uint|退火终止参数|2|>0| 更多参数配置请参考[官方文档](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/devaids/devtools/amct/atlasamct_16_0131.html) -- Gitee From 8612ca79f28606ad15393d039dbb4ddc89a5b6f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 20 Feb 2025 11:25:39 +0800 Subject: [PATCH 08/21] delete --- .../9_amct/amct_pytorch/ada_round_calibration/data/.keep | 0 .../9_amct/amct_pytorch/ada_round_calibration/model/.keep | 0 .../9_amct/amct_pytorch/ada_round_calibration/src/__init__.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep delete mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep delete mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py deleted file mode 100644 index e69de29bb..000000000 -- Gitee From 702da07eecc5f0235d84e3633f4f690e1bcda7c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 20 Feb 2025 11:37:52 +0800 Subject: [PATCH 09/21] fix --- .../9_amct/amct_pytorch/ada_round_calibration/data/.gitkeep | 0 .../9_amct/amct_pytorch/ada_round_calibration/model/.gitkeep | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.gitkeep create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.gitkeep diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.gitkeep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.gitkeep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.gitkeep new file mode 100644 index 000000000..e69de29bb -- Gitee From c42b12b5b39f27b81b63d0a7f279d880c16eb3a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Wed, 23 Apr 2025 14:28:19 +0800 Subject: [PATCH 10/21] fp8/hi8 weight only --- .../hif8_fp8_weight_quantization/README_CN.md | 43 +++++ .../requirements.txt | 7 + .../src/quantization.cfg | 5 + .../src/run_llama7b_quantization.py | 153 ++++++++++++++++++ .../hif8_fp8_weight_quantization/src/utils.py | 82 ++++++++++ 5 files changed, 290 insertions(+) create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt create mode 100644 
python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md new file mode 100644 index 000000000..e7d9e9df8 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -0,0 +1,43 @@ +# MXFP4量化 + +## 1 MXFP4量化前提 + +### 1.1 安装依赖 + +本sample依赖包可参考[requirements.txt](requirements.txt) + +### 1.2 模型和数据集准备 + +本sample以Llama2-7b模型,pileval和wikitext2数据集为示例,请用户自行下载,并适配utils.py文件中加载数据集和模型的路径。当前sample中数据集保存目录需根据实际保存目录修改。 + +### 1.3 简易量化配置 +./src/quantization.cfg文件为用户自定义的简易量化配置,具体表示信息如下: + +| 字段 |类型| 说明 | 默认值 | 取值范围 | +|:--| :-: | :-- | :-: | :-: | +|skip_layers|str|跳过量化的层 |/|/|跳过量化层支持模糊匹配,当配置字符串为层名字串,或与层名一致时,跳过该层量化,不生成量化配置。字符串必须包含数字或字母| +|weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False| +|weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN| + +> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3,如果想HIFLOAT8仅权重量化,请适配修改quantization.cfg +## 2 FLOAT8_E4M3FN量化示例 + +### 2.1 使用接口方式调用 + +请在当前目录执行如下命令运行示例程序,用户需根据实际情况修改示例程序中的模型和数据集路径: + +`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py` + +若出现如下信息,则说明量化成功: + +```none +Test time taken: 1.0 min 59.24865388870239 s +Score: 5.670858383178711 +``` + +推理成功后,在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./outputs文件夹,该文件夹内包含以下内容: + +- config.json:量化配置文件,描述了如何对模型中的每一层进行量化。 +- record.txt:量化因子记录文件。 + +> 如果outputs目录下已经存在量化配置文件或量化因子记录文件,再次运行示例程序时,如果新生成的文件与已有文件同名,则会覆盖已有的量化配置文件或量化因子记录文件。 diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt new file mode 100644 index 000000000..55441d062 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt @@ -0,0 +1,7 @@ +torch==2.1.0 +transformers==4.40.0 +accelerate==0.30.1 +datasets==2.19.1 +sentencepiece==0.2.0 +numpy==1.23.5 +protobuf==3.20.2 \ No newline at end of file diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg new file mode 100644 index 000000000..179e23bac --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg @@ -0,0 +1,5 @@ +skip_layers: "lm_head" +weight_only_config: { + weight_compress_only: True + wts_type: FLOAT8_E4M3 +} \ No newline at end of file diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py new file mode 100644 index 000000000..8ad31a556 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py @@ -0,0 +1,153 @@ +""" +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance 
with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + + +import os +import copy +import time +import tqdm +import torch +import torch.nn as nn +from transformers import AutoTokenizer, AutoConfig +from accelerate import infer_auto_device_map, dispatch_model +from accelerate.utils.modeling import get_balanced_memory + +from utils import get_loaders, get_llama2, get_calib_dataset +import amct_pytorch as amct + + +def build_model_and_enc(model, model_path, gpu_num): + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + if "mpt" in config.__class__.__name__.lower(): + enc = AutoTokenizer.from_pretrained( + config.tokenizer_name, trust_remote_code=True + ) + else: + enc = AutoTokenizer.from_pretrained( + model_path, use_fast=False, trust_remote_code=True + ) + + # Move the model to GPU (as much as possible) for LM evaluation + # max_memory = ['0:16GiB', '1:16GiB','2:16GiB', 'cpu:30GiB'], '0' means the first GPU that you specify. + # I don't recommend use 16GiB, we need to reserve some space for other tensors during calculation + # please see the recommand memeory allocation in the Word file + # Adjust the max_size accroding to the real situation + # a clever way: + + max_memory = [] + for i in range(gpu_num): + max_memory.append(f'{i}:12GiB') + max_memory.append('cpu:80GiB') + print('Max_memory allocation: \n', max_memory) + + max_memory = [v.split(":") for v in (max_memory or [])] + max_memory = {(int(k) if k.isdigit() else k): v for k, v in max_memory} + kwargs = { + "max_memory": get_balanced_memory( + model, max_memory if len(max_memory) > 0 else None + ) + } + model.tie_weights() + device_map = infer_auto_device_map( + model, + no_split_module_classes=[ + "LlamaDecoderLayer", + ], + **kwargs, + ) + model = dispatch_model(model, device_map=device_map, + offload_dir=os.path.join(model_path, 'offload_dir')) + + return model, enc + +if __name__ == '__main__': + model, model_path = get_llama2('7b') + model = model.eval() + copied_model = copy.deepcopy(model) + gpu_num = torch.cuda.device_count() + model, enc = build_model_and_enc(model, model_path, gpu_num) + + proto_path = './src/quantization.cfg' + config_file = './output/config.json' + record_file = './output/record.txt' + + test_start_time = time.time() + # Phase1: generate quant config json + amct.create_post_quant_config(config_file, + model, + config_defination=proto_path) + + # Phase2: do weights calibration and generate calibration model + samples = get_calib_dataset( + data="pileval", tokenizer=enc, n_samples=512, block_size=256 + ) + samples = torch.cat(samples, dim=0)[:1,:] + model.config.use_cache = False + post_quant_model = amct.create_post_quant_model(config_file, + record_file, + model) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + post_quant_model.config.use_cache = False + with torch.no_grad(): + post_quant_model(samples.to(next(post_quant_model.parameters()).device)) + torch.cuda.empty_cache() + test_end_time = time.time() + total_time = test_end_time - test_start_time + print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's') + # save memory, del unuse model + del post_quant_model + + 
if torch.cuda.is_available(): + torch.cuda.empty_cache() + model, enc = build_model_and_enc(copied_model, model_path, gpu_num) + + # Phase3: save fakequant model + testenc = get_loaders(dataset_name='wikitext2', + enc=enc, + seqlen=model.seqlen) + + testenc = testenc.input_ids.to(model.device) + fake_quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant') + nsamples = testenc.numel() // model.seqlen + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Phase4: Test ppl result + nlls = [] + test_start_time = time.time() + for i in tqdm.tqdm(range(nsamples), desc="evaluating..."): + batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to( + model.device + ) + with torch.no_grad(): + lm_logits = fake_quant_model(batch).logits + shift_logits = lm_logits[:, :-1, :].contiguous().float() + shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) + ) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + test_end_time = time.time() + + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + + total_time = test_end_time - test_start_time + print('Test time taken: ', total_time // 60, 'min ', total_time%60, 's' ) + print('Score: ', ppl.item()) \ No newline at end of file diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py new file mode 100644 index 000000000..af20318be --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py @@ -0,0 +1,82 @@ +""" +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import torch +import torch.nn as nn +from datasets import load_dataset,load_from_disk + +def get_llama2(model, seqlen=2048): + '''If model is specified from ['7b', '13b', '70b'], then we load official pretrained model; + If you want to load checkpoints other than the official ones, please specifiy the model path, + otherwise please choose from ['7b', '13b', '70b'] for better clarity + ''' + + def skip(*args, **kwargs): + pass + + if model in ['7b', '13b', '70b']: + model_path = f'/data/Models/pytorch/Llama2/Llama2_{model}_hf' + print(f'Getting official pretrained Llama2-{model}') + else: + model_path = model + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + + model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, offload_folder="offload/") + + model.seqlen = seqlen + return model, model_path + + +def get_loaders(dataset_name: str, enc, seqlen): + if dataset_name == 'wikitext2': + print('Loading dataset: Wikitext2') + testenc = load_dataset('/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py', 'wikitext-2-raw-v1', split='test', trust_remote_code=True) + testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt") + + return testenc + + +def get_calib_dataset(data="pileval", tokenizer=None, n_samples=512, block_size=512): + if data == "pileval": + dataset = load_from_disk('/pile_val_backup') + else: + raise NotImplementedError + dataset = dataset.shuffle(seed=42) + samples = [] + n_run = 0 + for data in dataset: + line = data["text"] + line = line.strip() + line_encoded = tokenizer.encode(line) + if len(line_encoded) > 512: + continue + sample = torch.tensor([line_encoded]) + if sample.numel() == 0: + continue + samples.append(sample) + n_run += 1 + if n_run == n_samples: + break + # now concatenate all samples and split according to block size + cat_samples = torch.cat(samples, dim=1) + n_split = cat_samples.shape[1] // block_size + print(f" * Split into {n_split} blocks") + return [ + cat_samples[:, i * block_size : (i + 1) * block_size] for i in range(n_split) + ] -- Gitee From 39e8781f5fcecb4edb422e617beb36de55aa85f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Wed, 23 Apr 2025 15:40:08 +0800 Subject: [PATCH 11/21] fix --- .../hif8_fp8_weight_quantization/README_CN.md | 8 ++++---- .../hif8_fp8_weight_quantization/src/quantization.cfg | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index e7d9e9df8..a0b2f11c9 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -19,7 +19,7 @@ |weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False| |weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN| -> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3,如果想HIFLOAT8仅权重量化,请适配修改quantization.cfg +> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3FN,如果想HIFLOAT8仅权重量化,请适配修改quantization.cfg ## 2 FLOAT8_E4M3FN量化示例 ### 2.1 使用接口方式调用 @@ -31,11 +31,11 @@ 若出现如下信息,则说明量化成功: ```none -Test time taken: 1.0 min 59.24865388870239 s -Score: 5.670858383178711 +Test time taken: 1.0 min 38.24865388870239 s +Score: 
5.481424331665039 ``` -推理成功后,在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./outputs文件夹,该文件夹内包含以下内容: +推理成功后,在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./output文件夹,该文件夹内包含以下内容: - config.json:量化配置文件,描述了如何对模型中的每一层进行量化。 - record.txt:量化因子记录文件。 diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg index 179e23bac..2d8b3dcc3 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg @@ -1,5 +1,5 @@ skip_layers: "lm_head" weight_only_config: { weight_compress_only: True - wts_type: FLOAT8_E4M3 + wts_type: FLOAT8_E4M3FN } \ No newline at end of file -- Gitee From fcac690da83206a5dab5e41a43842026f0e7c162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 24 Apr 2025 09:06:01 +0800 Subject: [PATCH 12/21] FIX --- .../amct_pytorch/hif8_fp8_weight_quantization/README_CN.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index a0b2f11c9..e63515c06 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -1,6 +1,6 @@ -# MXFP4量化 +# FP8/HIF8量化 -## 1 MXFP4量化前提 +## 1 FP8/HIF8量化前提 ### 1.1 安装依赖 @@ -15,7 +15,7 @@ | 字段 |类型| 说明 | 默认值 | 取值范围 | |:--| :-: | :-- | :-: | :-: | -|skip_layers|str|跳过量化的层 |/|/|跳过量化层支持模糊匹配,当配置字符串为层名字串,或与层名一致时,跳过该层量化,不生成量化配置。字符串必须包含数字或字母| +|skip_layers|str|跳过量化的层 |/|/| |weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False| |weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN| -- Gitee From 1632297a55aa637d56b09befdb9198cd76cc9611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Fri, 25 Apr 2025 09:55:10 +0800 Subject: [PATCH 13/21] fix --- .../src/run_llama7b_quantization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py index 8ad31a556..092238d22 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py @@ -104,7 +104,8 @@ if __name__ == '__main__': post_quant_model.config.use_cache = False with torch.no_grad(): post_quant_model(samples.to(next(post_quant_model.parameters()).device)) - torch.cuda.empty_cache() + if torch.cuda.is_available(): + torch.cuda.empty_cache() test_end_time = time.time() total_time = test_end_time - test_start_time print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's') -- Gitee From 06468d2222c445874f712f3be14812aa5ba928d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Fri, 25 Apr 2025 12:03:02 +0800 Subject: [PATCH 14/21] fix --- .../amct_pytorch/hif8_fp8_weight_quantization/README_CN.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) 
diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index e63515c06..5f6358064 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -19,8 +19,10 @@ |weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False| |weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN| -> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3FN,如果想HIFLOAT8仅权重量化,请适配修改quantization.cfg ## 2 FLOAT8_E4M3FN量化示例 +> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3FN,如果需要HIFLOAT8仅权重量化,请适配修改quantization.cfg + +> 如果要验证deploy模型,需要设置save_post_quant_model接口中参数mode为'deploy',并将生成的部署模型搬到npu上进行推理 ### 2.1 使用接口方式调用 -- Gitee From 0f2773106d5e6cfb59b995347147705005764677 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Fri, 25 Apr 2025 14:07:52 +0800 Subject: [PATCH 15/21] fix --- .../amct_pytorch/hif8_fp8_weight_quantization/README_CN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index 5f6358064..04b9c0973 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -34,7 +34,7 @@ ```none Test time taken: 1.0 min 38.24865388870239 s -Score: 5.481424331665039 +Score: 5.48 ``` 推理成功后,在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./output文件夹,该文件夹内包含以下内容: -- Gitee From f2f3fef03944e4175893488799fa23667f2cdb04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Tue, 29 Apr 2025 11:38:43 +0800 Subject: [PATCH 16/21] fix npu --- .../hif8_fp8_weight_quantization/README_CN.md | 15 ++++++--- .../src/run_llama7b_quantization.py | 31 +++++++++++++------ .../hif8_fp8_weight_quantization/src/utils.py | 26 ++++++---------- 3 files changed, 42 insertions(+), 30 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index 04b9c0973..1b575ad98 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -8,7 +8,7 @@ ### 1.2 模型和数据集准备 -本sample以Llama2-7b模型,pileval和wikitext2数据集为示例,请用户自行下载,并适配utils.py文件中加载数据集和模型的路径。当前sample中数据集保存目录需根据实际保存目录修改。 +本sample以Llama2-7b模型,pileval和wikitext2数据集为示例,请用户自行下载。 ### 1.3 简易量化配置 ./src/quantization.cfg文件为用户自定义的简易量化配置,具体表示信息如下: @@ -22,13 +22,20 @@ ## 2 FLOAT8_E4M3FN量化示例 > 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3FN,如果需要HIFLOAT8仅权重量化,请适配修改quantization.cfg -> 如果要验证deploy模型,需要设置save_post_quant_model接口中参数mode为'deploy',并将生成的部署模型搬到npu上进行推理 ### 2.1 使用接口方式调用 -请在当前目录执行如下命令运行示例程序,用户需根据实际情况修改示例程序中的模型和数据集路径: +请在当前目录执行如下命令运行示例程序 -`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py` +验证fakequant模型脚本: + +`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py --test_on_npu_flag=false --calibration_data=/pile_val_backup/ 
--verify_data=/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py --model=/data/Models/pytorch/Llama2/Llama2_7b_hf` + +验证deploy模型脚本: + +`python3 src/run_llama7b_quantization.py --test_on_npu_flag=true` + +> test_on_npu_flag参数表明是否生成部署模型在npu上推理,calibration_data参数为校准集路径,verify_data为验证集的路径,model为模型存放路径 若出现如下信息,则说明量化成功: diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py index 092238d22..4380377ac 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py @@ -14,7 +14,7 @@ # limitations under the License. """ - +import argparse import os import copy import time @@ -74,7 +74,14 @@ def build_model_and_enc(model, model_path, gpu_num): return model, enc if __name__ == '__main__': - model, model_path = get_llama2('7b') + parser = argparse.ArgumentParser() + parser.add_argument('--test_on_npu_flag', type=lambda x: (str(x).lower() == 'true')) + parser.add_argument('--calibration_data', type=str, default="/pile_val_backup/") + parser.add_argument('--verify_data', type=str, default="/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py") + parser.add_argument('--model', type=str, default="/data/Models/pytorch/Llama2/Llama2_7b_hf") + + args = parser.parse_args() + model, model_path = get_llama2(args.model) model = model.eval() copied_model = copy.deepcopy(model) gpu_num = torch.cuda.device_count() @@ -92,7 +99,7 @@ if __name__ == '__main__': # Phase2: do weights calibration and generate calibration model samples = get_calib_dataset( - data="pileval", tokenizer=enc, n_samples=512, block_size=256 + data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=256 ) samples = torch.cat(samples, dim=0)[:1,:] model.config.use_cache = False @@ -117,12 +124,18 @@ if __name__ == '__main__': model, enc = build_model_and_enc(copied_model, model_path, gpu_num) # Phase3: save fakequant model - testenc = get_loaders(dataset_name='wikitext2', + testenc = get_loaders(data_path=args.verify_data, enc=enc, seqlen=model.seqlen) testenc = testenc.input_ids.to(model.device) - fake_quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant') + + if args.test_on_npu_flag: + quant_model = amct.save_post_quant_model(record_file, model, mode='deploy') + quant_model = quant_model.npu() + else: + quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant') + nsamples = testenc.numel() // model.seqlen if torch.cuda.is_available(): @@ -133,12 +146,12 @@ if __name__ == '__main__': test_start_time = time.time() for i in tqdm.tqdm(range(nsamples), desc="evaluating..."): batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to( - model.device + quant_model.device ) with torch.no_grad(): - lm_logits = fake_quant_model(batch).logits - shift_logits = lm_logits[:, :-1, :].contiguous().float() - shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:] + lm_logits = quant_model(batch).logits + shift_logits = lm_logits[:, :-1, :].contiguous().float().cpu() + shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:].cpu() loss_fct = nn.CrossEntropyLoss() loss = loss_fct( shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) diff --git 
a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py index af20318be..5bea4e1fc 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py @@ -18,7 +18,7 @@ import torch import torch.nn as nn from datasets import load_dataset,load_from_disk -def get_llama2(model, seqlen=2048): +def get_llama2(model_path, seqlen=2048): '''If model is specified from ['7b', '13b', '70b'], then we load official pretrained model; If you want to load checkpoints other than the official ones, please specifiy the model path, otherwise please choose from ['7b', '13b', '70b'] for better clarity @@ -27,36 +27,28 @@ def get_llama2(model, seqlen=2048): def skip(*args, **kwargs): pass - if model in ['7b', '13b', '70b']: - model_path = f'/data/Models/pytorch/Llama2/Llama2_{model}_hf' - print(f'Getting official pretrained Llama2-{model}') - else: - model_path = model torch.nn.init.kaiming_uniform_ = skip torch.nn.init.uniform_ = skip torch.nn.init.normal_ = skip from transformers import LlamaForCausalLM - model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, offload_folder="offload/") + model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, offload_folder="offload/") model.seqlen = seqlen return model, model_path -def get_loaders(dataset_name: str, enc, seqlen): - if dataset_name == 'wikitext2': - print('Loading dataset: Wikitext2') - testenc = load_dataset('/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py', 'wikitext-2-raw-v1', split='test', trust_remote_code=True) - testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt") +def get_loaders(data_path: str, enc, seqlen): + + print('Loading dataset: Wikitext2') + testenc = load_dataset(data_path, 'wikitext-2-raw-v1', split='test', trust_remote_code=True) + testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt") return testenc -def get_calib_dataset(data="pileval", tokenizer=None, n_samples=512, block_size=512): - if data == "pileval": - dataset = load_from_disk('/pile_val_backup') - else: - raise NotImplementedError +def get_calib_dataset(data_path, tokenizer=None, n_samples=512, block_size=512): + dataset = load_from_disk(data_path) dataset = dataset.shuffle(seed=42) samples = [] n_run = 0 -- Gitee From 47e489f94bce556a7fc81806b8f07e7b551990a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Tue, 29 Apr 2025 11:52:21 +0800 Subject: [PATCH 17/21] fix --- .../amct_pytorch/hif8_fp8_weight_quantization/README_CN.md | 2 +- .../src/run_llama7b_quantization.py | 6 +++--- .../amct_pytorch/hif8_fp8_weight_quantization/src/utils.py | 5 ----- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index 1b575ad98..2c5cc0108 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -31,7 +31,7 @@ `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py --test_on_npu_flag=false --calibration_data=/pile_val_backup/ 
--verify_data=/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py --model=/data/Models/pytorch/Llama2/Llama2_7b_hf` -验证deploy模型脚本: +验证deploy模型脚本(需要适配npu相关环境): `python3 src/run_llama7b_quantization.py --test_on_npu_flag=true` diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py index 4380377ac..c30010c17 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py @@ -76,9 +76,9 @@ def build_model_and_enc(model, model_path, gpu_num): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--test_on_npu_flag', type=lambda x: (str(x).lower() == 'true')) - parser.add_argument('--calibration_data', type=str, default="/pile_val_backup/") - parser.add_argument('--verify_data', type=str, default="/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py") - parser.add_argument('--model', type=str, default="/data/Models/pytorch/Llama2/Llama2_7b_hf") + parser.add_argument('--calibration_data', type=str, default='/pile_val_backup/') + parser.add_argument('--verify_data', type=str, default='/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py') + parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf/') args = parser.parse_args() model, model_path = get_llama2(args.model) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py index 5bea4e1fc..586916fbd 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py @@ -19,11 +19,6 @@ import torch.nn as nn from datasets import load_dataset,load_from_disk def get_llama2(model_path, seqlen=2048): - '''If model is specified from ['7b', '13b', '70b'], then we load official pretrained model; - If you want to load checkpoints other than the official ones, please specifiy the model path, - otherwise please choose from ['7b', '13b', '70b'] for better clarity - ''' - def skip(*args, **kwargs): pass -- Gitee From 7ac2ce67740b2229a7be300d69116c892a1640f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Tue, 29 Apr 2025 11:55:58 +0800 Subject: [PATCH 18/21] fix --- .../src/run_llama7b_quantization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py index c30010c17..c11e09fc6 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py @@ -76,9 +76,9 @@ def build_model_and_enc(model, model_path, gpu_num): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--test_on_npu_flag', type=lambda x: (str(x).lower() == 'true')) - parser.add_argument('--calibration_data', type=str, 
default='/pile_val_backup/')
+    parser.add_argument('--calibration_data', type=str, default='/pile_val_backup')
     parser.add_argument('--verify_data', type=str, default='/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py')
-    parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf/')
+    parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf')
 
     args = parser.parse_args()
     model, model_path = get_llama2(args.model)
-- Gitee

From 16180368e0e692db2c38fe6bed50ee21917a0087 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com>
Date: Tue, 29 Apr 2025 14:31:00 +0800
Subject: [PATCH 19/21] fix

---
 .../src/run_llama7b_quantization.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
index c11e09fc6..2b2f14603 100644
--- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
+++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
@@ -102,17 +102,17 @@ if __name__ == '__main__':
         data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=256
     )
     samples = torch.cat(samples, dim=0)[:1,:]
-    model.config.use_cache = False
+
     post_quant_model = amct.create_post_quant_model(config_file,
                                                     record_file,
                                                     model)
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-    post_quant_model.config.use_cache = False
+
     with torch.no_grad():
         post_quant_model(samples.to(next(post_quant_model.parameters()).device))
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     test_end_time = time.time()
     total_time = test_end_time - test_start_time
     print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's')
-- Gitee

From 11a4afa09abbfa07071b87cbf322b9aad7100582 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com>
Date: Mon, 23 Jun 2025 20:07:53 +0800
Subject: [PATCH 20/21] add fp4

---
 .../fp4_weight_quantization/README_CN.md      |  50 ++++
 .../fp4_weight_quantization/requirements.txt  |   7 +
 .../src/quantization.cfg                      |   8 +
 .../src/run_llama7b_quantization.py           | 162 ++++++++++++++++++
 .../fp4_weight_quantization/src/utils.py      |  69 ++++++++
 5 files changed, 296 insertions(+)
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py

diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
new file mode 100644
index 000000000..8482006cd
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
@@ -0,0 +1,50 @@
+# FP4伪量化
+
+## 1 FP4伪量化前提
+
+### 1.1 安装依赖
+
+本sample依赖包可参考[requirements.txt](requirements.txt)
+
+### 1.2 模型和数据集准备
+
+本sample以Llama2-7b模型,pileval和wikitext2数据集为示例,请用户自行下载。
+
+### 1.3 简易量化配置
+./src/quantization.cfg文件为用户自定义的简易量化配置,具体表示信息如下:
+
+| 字段 |类型| 说明 | 默认值 | 取值范围 |
+|:--| :-: | :-- | :-: | :-: |
+|skip_layers|str|跳过量化的层 |/|/|
+|weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False|
+|weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/FLOAT4_E2M1/HIFLOAT8/FLOAT8_E4M3FN|
+|weight_only_config.awq_quantize.grids_num|int|awq搜索格点数量|20|/|
+
+## 2 FLOAT4_E2M1量化示例
+> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT4_E2M1
+
+
+### 2.1 使用接口方式调用
+
+请在当前目录执行如下命令运行示例程序:
+
+验证fakequant模型脚本:
+
+`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py --calibration_data=/pile_val_backup/ --verify_data=/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py --model=/data/Models/pytorch/Llama2/Llama2_7b_hf`
+
+
+若出现如下信息,则说明量化成功:
+
+```none
+Test time taken: 9.0 min 38.24865388870239 s
+Score: 5.657759
+```
+
+推理成功后,在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./output文件夹,该文件夹内包含以下内容:
+
+- config.json:量化配置文件,描述了如何对模型中的每一层进行量化。
+- record.txt:量化因子记录文件。
+- awq_result.pt:存储了awq算法的scale和clip。
+- quant_factor.pt:存储量化缩放因子。
+
+> 如果output目录下已经存在量化配置文件或量化因子记录文件,再次运行示例程序时,如果新生成的文件与已有文件同名,则会覆盖已有的量化配置文件或量化因子记录文件。
diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt
new file mode 100644
index 000000000..55441d062
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt
@@ -0,0 +1,7 @@
+torch==2.1.0
+transformers==4.40.0
+accelerate==0.30.1
+datasets==2.19.1
+sentencepiece==0.2.0
+numpy==1.23.5
+protobuf==3.20.2
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg
new file mode 100644
index 000000000..a43152ad3
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg
@@ -0,0 +1,8 @@
+skip_layers: "lm_head"
+weight_only_config: {
+    weight_compress_only: True
+    wts_type: FLOAT4_E2M1
+    awq_quantize:{
+        grids_num: 20
+    }
+}
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
new file mode 100644
index 000000000..76d164c0f
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
@@ -0,0 +1,162 @@
+"""
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" + +import argparse +import os +import copy +import time +import tqdm +import torch +import torch.nn as nn +from transformers import AutoTokenizer, AutoConfig +from accelerate import infer_auto_device_map, dispatch_model +from accelerate.utils.modeling import get_balanced_memory + +from utils import get_loaders, get_llama2, get_calib_dataset +import amct_pytorch as amct + + +def build_model_and_enc(model, model_path, gpu_num): + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + if "mpt" in config.__class__.__name__.lower(): + enc = AutoTokenizer.from_pretrained( + config.tokenizer_name, trust_remote_code=True + ) + else: + enc = AutoTokenizer.from_pretrained( + model_path, use_fast=False, trust_remote_code=True + ) + + # Move the model to GPU (as much as possible) for LM evaluation + # max_memory = ['0:16GiB', '1:16GiB','2:16GiB', 'cpu:30GiB'], '0' means the first GPU that you specify. + # I don't recommend use 16GiB, we need to reserve some space for other tensors during calculation + # please see the recommand memeory allocation in the Word file + # Adjust the max_size accroding to the real situation + # a clever way: + + max_memory = [] + for i in range(gpu_num): + max_memory.append(f'{i}:12GiB') + max_memory.append('cpu:80GiB') + print('Max_memory allocation: \n', max_memory) + + max_memory = [v.split(":") for v in (max_memory or [])] + max_memory = {(int(k) if k.isdigit() else k): v for k, v in max_memory} + kwargs = { + "max_memory": get_balanced_memory( + model, max_memory if len(max_memory) > 0 else None + ) + } + model.tie_weights() + device_map = infer_auto_device_map( + model, + no_split_module_classes=[ + "LlamaDecoderLayer", + ], + **kwargs, + ) + model = dispatch_model(model, device_map=device_map, + offload_dir=os.path.join(model_path, 'offload_dir')) + + return model, enc + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--calibration_data', type=str, default='/pile_val_backup') + parser.add_argument('--verify_data', type=str, default='/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py') + parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf') + + args = parser.parse_args() + model, model_path = get_llama2(args.model) + model = model.eval() + copied_model = copy.deepcopy(model) + gpu_num = torch.cuda.device_count() + model, enc = build_model_and_enc(model, model_path, gpu_num) + + proto_path = './src/quantization.cfg' + config_file = './output/config.json' + record_file = './output/record.txt' + + test_start_time = time.time() + # Phase1: generate quant config json + amct.create_post_quant_config(config_file, + model, + config_defination=proto_path) + + # Phase2: do weights calibration and generate calibration model + samples = get_calib_dataset( + data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=256 + ) + samples = torch.cat(samples, dim=0)[:1,:] + + post_quant_model = amct.create_post_quant_model(config_file, + record_file, + model) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + with torch.no_grad(): + post_quant_model(samples.to(next(post_quant_model.parameters()).device)) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + test_end_time = time.time() + total_time = test_end_time - test_start_time + print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's') + # save memory, del unuse model + del post_quant_model + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + 
model, enc = build_model_and_enc(copied_model, model_path, gpu_num) + + # Phase3: save fakequant model + testenc = get_loaders(data_path=args.verify_data, + enc=enc, + seqlen=model.seqlen) + + testenc = testenc.input_ids.to(model.device) + + quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant') + + nsamples = testenc.numel() // model.seqlen + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Phase4: Test ppl result + nlls = [] + test_start_time = time.time() + for i in tqdm.tqdm(range(nsamples), desc="evaluating..."): + batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to( + quant_model.device + ) + with torch.no_grad(): + lm_logits = quant_model(batch).logits + shift_logits = lm_logits[:, :-1, :].contiguous().float().cpu() + shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:].cpu() + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) + ) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + test_end_time = time.time() + + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + + total_time = test_end_time - test_start_time + print('Test time taken: ', total_time // 60, 'min ', total_time%60, 's' ) + print('Score: ', ppl.item()) \ No newline at end of file diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py new file mode 100644 index 000000000..586916fbd --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py @@ -0,0 +1,69 @@ +""" +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import torch +import torch.nn as nn +from datasets import load_dataset,load_from_disk + +def get_llama2(model_path, seqlen=2048): + def skip(*args, **kwargs): + pass + + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + + model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, offload_folder="offload/") + + model.seqlen = seqlen + return model, model_path + + +def get_loaders(data_path: str, enc, seqlen): + + print('Loading dataset: Wikitext2') + testenc = load_dataset(data_path, 'wikitext-2-raw-v1', split='test', trust_remote_code=True) + testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt") + + return testenc + + +def get_calib_dataset(data_path, tokenizer=None, n_samples=512, block_size=512): + dataset = load_from_disk(data_path) + dataset = dataset.shuffle(seed=42) + samples = [] + n_run = 0 + for data in dataset: + line = data["text"] + line = line.strip() + line_encoded = tokenizer.encode(line) + if len(line_encoded) > 512: + continue + sample = torch.tensor([line_encoded]) + if sample.numel() == 0: + continue + samples.append(sample) + n_run += 1 + if n_run == n_samples: + break + # now concatenate all samples and split according to block size + cat_samples = torch.cat(samples, dim=1) + n_split = cat_samples.shape[1] // block_size + print(f" * Split into {n_split} blocks") + return [ + cat_samples[:, i * block_size : (i + 1) * block_size] for i in range(n_split) + ] -- Gitee From 8ab0d35be3876ccead6bfb06e5a43b08fa5167fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Wed, 25 Jun 2025 10:28:26 +0800 Subject: [PATCH 21/21] fix --- .../9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md | 2 +- .../fp4_weight_quantization/src/run_llama7b_quantization.py | 2 +- .../9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md index 8482006cd..93ea0a9ce 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md @@ -18,7 +18,7 @@ |skip_layers|str|跳过量化的层 |/|/| |weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False| |weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN| -|weight_only_config.awq_quantize.grids_num|int|awq搜索格点数量|20|/|/| +|weight_only_config.awq_quantize.grids_num|uint32|awq搜索格点数量|20|/|/| ## 2 FLOAT4_E2M1量化示例 > 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT4_E2M1 diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py index 76d164c0f..4aac4fad9 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py +++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py @@ -98,7 +98,7 @@ if __name__ == '__main__': # Phase2: do weights calibration and generate calibration model samples = get_calib_dataset( - data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=256 + data_path=args.calibration_data, tokenizer=enc, 
n_samples=512, block_size=518 ) samples = torch.cat(samples, dim=0)[:1,:] diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py index 586916fbd..474a5b618 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py +++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py @@ -27,7 +27,7 @@ def get_llama2(model_path, seqlen=2048): torch.nn.init.normal_ = skip from transformers import LlamaForCausalLM - model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, offload_folder="offload/") + model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, offload_folder="offload/") model.seqlen = seqlen return model, model_path -- Gitee
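The evaluation loop shared by the FP8/HIF8 and FP4 samples (Phase 4 of `run_llama7b_quantization.py`) scores the fakequant model by perplexity over non-overlapping `seqlen`-token windows of the tokenized test set. A self-contained sketch of that computation, assuming a Hugging Face causal LM exposing `.logits` and `.device`; the helper name `eval_perplexity` and the approximation of scaling the mean loss by the full window length follow the sample code:

```python
import torch
import torch.nn as nn

def eval_perplexity(model, token_ids: torch.Tensor, seqlen: int) -> float:
    """token_ids: LongTensor of shape [1, total_tokens] from the tokenizer."""
    nsamples = token_ids.numel() // seqlen
    loss_fct = nn.CrossEntropyLoss()
    nlls = []
    for i in range(nsamples):
        batch = token_ids[:, i * seqlen:(i + 1) * seqlen].to(model.device)
        with torch.no_grad():
            logits = model(batch).logits
        # Next-token objective: logits at positions 0..n-2 are scored
        # against the labels at positions 1..n-1.
        shift_logits = logits[:, :-1, :].contiguous().float().cpu()
        shift_labels = batch[:, 1:].cpu()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                        shift_labels.view(-1))
        nlls.append(loss * seqlen)  # approximate total NLL for the window
    return torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen)).item()

# Lower is better: the patched READMEs report a score of 5.48 for the
# FLOAT8_E4M3FN fakequant model and 5.657759 for FLOAT4_E2M1.
```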