From c513fd34529b6bcbf51a4b6432a3ed0f4c29a9ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 13 Feb 2025 20:11:23 +0800 Subject: [PATCH 01/21] adaround --- .../ada_round_calibration/README_CN.md | 72 ++++ .../ada_round_calibration/data/.keep | 0 .../ada_round_calibration/model/.keep | 0 .../ada_round_calibration/src/__init__.py | 0 .../src/config/quant.cfg | 5 + .../src/resnet-18_calibration.py | 172 +++++++++ .../ada_round_calibration/src/resnet.py | 357 ++++++++++++++++++ 7 files changed, 606 insertions(+) create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md new file mode 100644 index 000000000..5c8c45ad8 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md @@ -0,0 +1,72 @@ +# ResNet-101 + +## 1. HIF8/FP8校准 + +### 1.1 量化前提 + ++ **模型准备** +请下载 [ResNet-18](https://download.pytorch.org/models/resnet18-5c106cde.pth) 模型文件并保存到`model`目录。 + + ++ **数据集准备** +使用昇腾模型压缩工具对模型完成量化后,需要对模型进行推理,以测试量化数据的精度。推理过程中需要使用与模型相匹配的数据集。请下载[测试图片](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/003_Atc_Models/AE/ATC%20Model/resnet-101_nuq/images.zip),解压后将`images`文件夹放到`data`目录。 + ++ **校准集准备** +校准集用来产生量化因子,保证精度。本 sample 校准集与数据集相同。 + +### 1.2 简易量化配置 +./config/ada_round.cfg文件为用户自定义的简易量化配置,具体表示信息如下: + + +| 字段 |类型| 说明 | 默认值 | 取值范围 | +|:--| :-: | :-- | :-: | :-: | +|common_config.ada_quantize.num_iteration|uint|adaround训练迭代次数|10000|>0| +|common_config.ada_quantize.warm_start|float|预热因子|0.2|(0,1)| +|common_config.ada_quantize.reg_param|float|正则化参数|0.01|(0,1)| +|common_config.ada_quantize.channel_wise|bool|是否开启pre channel|true|false/true| +|common_config.ada_quantize.beta_range_start|uint|beta衰减范围|20|/| +|common_config.ada_quantize.beta_range_end|uint|beta衰减范围|2|/| + +更多参数配置请参考[资料](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/devaids/devtools/amct/atlasamct_16_0131.html) + + +### 1.3 量化示例 + +执行量化示例前,请先检查当前目录下是否包含以下文件及目录,其中 images 文件夹内部包含有 160 张用于校准和测试的图片: + ++ [data](./data/) + + images ++ [model](./model/) + + resnet18-5c106cde.pth ++ [src](./src/) + + [config/ada_round.cfg](./src/config/ada_round.cfg) + + [\_\_init__.py](./src/__init__.py) + + [resnet-18_calibration.py](./src/resnet-18_calibration.py) + + [resnet.py](./src/resnet.py) + +请在当前目录执行如下命令运行示例程序: + +```bash +CUDA_VISIBLE_DEVICES=0 python ./src/resnet-18_calibration.py +``` + +> 其中 `CUDA_VISIBLE_DEVICES` 是必填参数,表示使用 CPU 还是 GPU 进行量化,参数取值为: +> +> + -1:使用 CPU 进行量化。 +> + 其他 Device ID使用 GPU 进行量化,具体 ID 请以用户实际环境为准。 +> + + +### 1.4 量化结果 + +量化成功后,在当前目录会生成量化日志文件 ./amct_log/amct_pytorch.log 和 ./outputs/calibration 文件夹,该文件夹内包含以下内容: + + ++ tmp: 临时文件夹 + + config.json: 量化配置文件,描述了如何对模型中的每一层进行量化。 + + record.txt: 量化因子记录文件记录量化因子。 + + 
modified_model.onnx: 临时模型文件,即原始的 PyTorch 模型 BN 融合后导出的 ONNX 模型文件。 ++ resnet-18_deploy_model.onnx: 量化部署模型,即量化后的可在昇腾 AI 处理器部署的模型文件。 ++ resnet-18_fake_quant_model.onnx: 量化仿真模型,即量化后的可在 ONNX 执行框架 ONNX Runtime 进行精度仿真的模型 + +> 如果量化脚本所在目录下已经存在量化配置文件,则再次调用 `create_quant_config` 接口时,如果新生成的量化配置文件与已有的文件同名,则会覆盖已有的量化配置文件,否则生成新的量化配置文件。 diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg new file mode 100644 index 000000000..bdbebea42 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg @@ -0,0 +1,5 @@ +common_config : { + ada_quantize : { + num_iteration : 2000 + } +} diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py new file mode 100644 index 000000000..d41a31b40 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py @@ -0,0 +1,172 @@ +""" +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + + +import os +import torch # pylint: disable=E0401 +from PIL import Image # pylint: disable=E0401 +from torchvision import transforms # pylint: disable=E0401 +import onnxruntime as ort + +import amct_pytorch as amct # pylint: disable=E0401 +from resnet import resnet18 # pylint: disable=E0401, C0415 + + +PATH = os.path.realpath('./') +IMG_DIR = os.path.join(PATH, 'data/images') +LABEL_FILE = os.path.join(IMG_DIR, 'image_label.txt') + +OUTPUTS = os.path.join(PATH, 'outputs/calibration') + +TMP = os.path.join(OUTPUTS, 'tmp') + + +def get_labels_from_txt(label_file): + """Read all images' name and label from label_file""" + images = [] + labels = [] + with open(label_file, 'r') as f: + lines = f.readlines() + for line in lines: + images.append(line.split(' ')[0]) + labels.append(int(line.split(' ')[1])) + return images, labels + + +def prepare_image_input(images): + """Read all images""" + input_tensor = torch.zeros(len(images), 3, 224, 224) # pylint: disable=E1101 + preprocess = transforms.Compose( + [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + for index, image in enumerate(images): + input_image = Image.open(image).convert('RGB') + input_tensor[index, ...] = preprocess(input_image) + return input_tensor + + +def img_postprocess(probs, labels): + """Do image post-process""" + # calculate top1 and top5 accuracy + top1_get = 0 + top5_get = 0 + prob_size = probs.shape[1] + for index, label in enumerate(labels): + top5_record = (probs[index, :].argsort())[prob_size - 5: prob_size] + if label == top5_record[-1]: + top1_get += 1 + top5_get += 1 + elif label in top5_record: + top5_get += 1 + return float(top1_get) / len(labels), float(top5_get) / len(labels) + + +def model_forward(model, batch_size, iterations): + """Do pytorch model forward""" + images, labels = get_labels_from_txt(LABEL_FILE) + images = [os.path.join(IMG_DIR, image) for image in images] + top1_total = 0 + top5_total = 0 + for i in range(iterations): + input_batch = prepare_image_input(images[i * batch_size: (i + 1) * batch_size]) + # move the input and model to GPU for speed if available + if torch.cuda.is_available(): + input_batch = input_batch.to('cuda') + model.to('cuda') + + with torch.no_grad(): + output = model(input_batch) + top1, top5 = img_postprocess(output, labels[i * batch_size: (i + 1) * batch_size]) + top1_total += top1 + top5_total += top5 + print('****************iteration:{}*****************'.format(i)) + print('top1_acc:{}'.format(top1)) + print('top5_acc:{}'.format(top5)) + print('******final top1:{}'.format(top1_total / iterations)) + print('******final top5:{}'.format(top5_total / iterations)) + return top1_total / iterations, top5_total / iterations + + +def onnx_forward(onnx_model, batch_size, iterations): + """Do onnx model forward""" + ort_session = ort.InferenceSession(onnx_model) + + images, labels = get_labels_from_txt(LABEL_FILE) + images = [os.path.join(IMG_DIR, image) for image in images] + top1_total = 0 + top5_total = 0 + for i in range(iterations): + input_batch = prepare_image_input(images[i * batch_size: (i + 1) * batch_size]) + output = ort_session.run(None, {'input': input_batch.numpy()}) + top1, top5 = img_postprocess(output[0], labels[i * batch_size: (i + 1) * batch_size]) + top1_total += top1 + top5_total += top5 + print('****************iteration:{}*****************'.format(i)) + print('top1_acc:{}'.format(top1)) + print('top5_acc:{}'.format(top5)) + print('******final 
top1:{}'.format(top1_total / iterations)) + print('******final top5:{}'.format(top5_total / iterations)) + return top1_total / iterations, top5_total / iterations + + +def main(): + """Sample main function""" + model = resnet18(pretrained=True) + + model.eval() + ori_top1, ori_top5 = model_forward(model, batch_size=32, iterations=5) + + # Quantize configurations + images, _ = get_labels_from_txt(LABEL_FILE) + images = [os.path.join(IMG_DIR, image) for image in images] + input_data = prepare_image_input(images[:32]) + if torch.cuda.is_available(): + input_data = (input_data.to('cuda')) + model.to('cuda') + config_json_file = os.path.join(TMP, 'config.json') + skip_layers = [] + batch_num = 2 + + config_defination = os.path.join(PATH, 'src/config/ada_round.cfg') + amct.create_quant_config(config_json_file, model, input_data, skip_layers, batch_num, config_defination=config_defination) + + + # Phase1: do conv+bn fusion, weights calibration and generate + # calibration model + record_file = os.path.join(TMP, 'record.txt') + modified_model = os.path.join(TMP, 'modified_model.onnx') + calibration_model = amct.quantize_model( + config_json_file, modified_model, record_file, model, input_data, input_names=['input'], + output_names=['output'], dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}) + + # Phase2: do calibration + model_forward(calibration_model, batch_size=32, iterations=batch_num) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Phase3: save final model, one for onnx do fake quant test, one + # deploy model for ATC + result_path = os.path.join(OUTPUTS, 'resnet-18') + amct.save_model(modified_model, record_file, result_path) + + # Phase4: run fake_quant model test + quant_top1, quant_top5 = onnx_forward( + '%s_%s' % (result_path, 'fake_quant_model.onnx'), batch_size=32, iterations=5) + print('[INFO] ResNet18 before quantize top1:{:>10} top5:{:>10}'.format(ori_top1, ori_top5)) + print('[INFO] ResNet18 after quantize top1:{:>10} top5:{:>10}'.format(quant_top1, quant_top5)) + +if __name__ == '__main__': + main() diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py new file mode 100644 index 000000000..ea1a4b56d --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py @@ -0,0 +1,357 @@ +import os +import torch +import torch.nn as nn + + +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', + 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', + 'wide_resnet50_2', 'wide_resnet101_2'] + + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', + 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', + 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', + 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', + 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', +} + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, 
out_planes, kernel_size=3, stride=stride, + padding=dilation, groups=groups, bias=False, dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class BasicBlock(nn.Module): + """BasicBlock""" + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + """forward""" + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + """ + Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + while original implementation places the stride at the first 1x1 convolution(self.conv1) + according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. + This variant is also known as ResNet V1.5 and improves accuracy according to + https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
+ """ + + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + """forward""" + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + """ResNet""" + def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, + groups=1, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None): + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, + dilate=replace_stride_with_dilation[2]) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = torch.reshape(x, (x.shape[0], x.shape[1])) + x = self.fc(x) + + return x + + def forward(self, x): + """forward""" + return self._forward_impl(x) + + +def _resnet(arch, block, layers, pretrained, progress, **kwargs): + model = ResNet(block, layers, **kwargs) + if pretrained: + model.load_state_dict(torch.load('./model/resnet101-5d3b4d8f.pth')) + return model + + +def resnet18(pretrained=False, progress=True, **kwargs): + r"""ResNet-18 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, + **kwargs) + + +def resnet34(pretrained=False, progress=True, **kwargs): + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet50(pretrained=False, progress=True, **kwargs): + r"""ResNet-50 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet101(pretrained=False, progress=True, **kwargs): + r"""ResNet-101 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, + **kwargs) + + +def resnet152(pretrained=False, progress=True, **kwargs): + r"""ResNet-152 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a 
progress bar of the download to stderr + """ + return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, + **kwargs) + + +def resnext50_32x4d(pretrained=False, progress=True, **kwargs): + r"""ResNeXt-50 32x4d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 4 + return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], + pretrained, progress, **kwargs) + + +def resnext101_32x8d(pretrained=False, progress=True, **kwargs): + r"""ResNeXt-101 32x8d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 8 + return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], + pretrained, progress, **kwargs) + + +def wide_resnet50_2(pretrained=False, progress=True, **kwargs): + r"""Wide ResNet-50-2 model from + `"Wide Residual Networks" `_ + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], + pretrained, progress, **kwargs) + + +def wide_resnet101_2(pretrained=False, progress=True, **kwargs): + r"""Wide ResNet-101-2 model from + `"Wide Residual Networks" `_ + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], + pretrained, progress, **kwargs) -- Gitee From fea7862b68fd043f713d336de5f30f134a48aa6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Fri, 14 Feb 2025 15:36:35 +0800 Subject: [PATCH 02/21] fix --- .../src/config/{quant.cfg => ada_round.cfg} | 1 + .../9_amct/amct_pytorch/ada_round_calibration/src/resnet.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) rename python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/{quant.cfg => ada_round.cfg} (72%) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg similarity index 72% rename from python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg rename to python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg index bdbebea42..a9d4989d8 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/quant.cfg +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg @@ -1,5 +1,6 @@ common_config : { ada_quantize : { num_iteration : 2000 + channel_wise : false } } diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py index ea1a4b56d..475aaafb9 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py @@ -229,7 +229,7 @@ class ResNet(nn.Module): def _resnet(arch, block, layers, pretrained, progress, **kwargs): model = ResNet(block, layers, **kwargs) if pretrained: - model.load_state_dict(torch.load('./model/resnet101-5d3b4d8f.pth')) + model.load_state_dict(torch.load('./model/resnet18-5c106cde.pth')) return model -- Gitee From 550affe7439012c36c9b0389cbd81677d359b472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Sat, 15 Feb 2025 16:12:38 +0800 Subject: [PATCH 03/21] fix --- .../ada_round_calibration/src/resnet-18_calibration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py index d41a31b40..e5b2ba283 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet-18_calibration.py @@ -132,7 +132,7 @@ def main(): # Quantize configurations images, _ = get_labels_from_txt(LABEL_FILE) images = [os.path.join(IMG_DIR, image) for image in images] - input_data = prepare_image_input(images[:32]) + input_data = prepare_image_input(images) if torch.cuda.is_available(): input_data = (input_data.to('cuda')) model.to('cuda') -- Gitee From e08579f234ac4b94ae2b976863e08725cae0cf34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Sat, 15 Feb 2025 16:45:37 +0800 Subject: [PATCH 04/21] fix --- 
.../ada_round_calibration/README_CN.md | 6 +++--- .../src/config/ada_round.cfg | 6 +++++- .../ada_round_calibration/src/resnet.py | 16 ++++++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md index 5c8c45ad8..d4db83bbf 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md @@ -1,6 +1,6 @@ -# ResNet-101 +# ResNet-18 -## 1. HIF8/FP8校准 +## 1. ada round量化校准 ### 1.1 量化前提 @@ -27,7 +27,7 @@ |common_config.ada_quantize.beta_range_start|uint|beta衰减范围|20|/| |common_config.ada_quantize.beta_range_end|uint|beta衰减范围|2|/| -更多参数配置请参考[资料](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/devaids/devtools/amct/atlasamct_16_0131.html) +更多参数配置请参考[官方文档](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/devaids/devtools/amct/atlasamct_16_0131.html) ### 1.3 量化示例 diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg index a9d4989d8..768b8deae 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/config/ada_round.cfg @@ -1,6 +1,10 @@ common_config : { ada_quantize : { - num_iteration : 2000 + num_iteration : 10000 + warm_start : 0.2 + reg_param : 0.01 + beta_range_start : 20 + beta_range_end : 2 channel_wise : false } } diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py index 475aaafb9..483755af8 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/resnet.py @@ -1,3 +1,19 @@ +""" +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + import os import torch import torch.nn as nn -- Gitee From 83b716f8de167ba5bb5a78cfabebc9f62c5f51a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Tue, 18 Feb 2025 08:39:41 +0800 Subject: [PATCH 05/21] fix --- .../amct_pytorch/ada_round_calibration/README_CN.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md index d4db83bbf..239cc81b3 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md @@ -56,6 +56,14 @@ CUDA_VISIBLE_DEVICES=0 python ./src/resnet-18_calibration.py > + 其他 Device ID使用 GPU 进行量化,具体 ID 请以用户实际环境为准。 > +若出现如下信息,则说明量化成功: + +```none +INFO - [AMCT]:[Utils]: The model file is saved in ./outputs/calibration/resnet-18_deploy_model.onnx +INFO - [AMCT]:[Utils]: The model file is saved in ./outputs/calibration/resnet-18_fake_quant_model.onnx +[INFO] ResNet18 before quantize top1: 0.84375 top5: 0.9625 +[INFO] ResNet18 after quantize top1: 0.84375 top5: 0.95625 +``` ### 1.4 量化结果 -- Gitee From 9007bbf7c6b962bfcd1ea136f418a17134ab13cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 20 Feb 2025 09:47:45 +0800 Subject: [PATCH 06/21] fix --- .../9_amct/amct_pytorch/ada_round_calibration/README_CN.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md index 239cc81b3..0c5f7b7cb 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md @@ -24,8 +24,8 @@ |common_config.ada_quantize.warm_start|float|预热因子|0.2|(0,1)| |common_config.ada_quantize.reg_param|float|正则化参数|0.01|(0,1)| |common_config.ada_quantize.channel_wise|bool|是否开启pre channel|true|false/true| -|common_config.ada_quantize.beta_range_start|uint|beta衰减范围|20|/| -|common_config.ada_quantize.beta_range_end|uint|beta衰减范围|2|/| +|common_config.ada_quantize.beta_range_start|uint|退火起始参数|20|/| +|common_config.ada_quantize.beta_range_end|uint|退火终止参数|2|/| 更多参数配置请参考[官方文档](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/devaids/devtools/amct/atlasamct_16_0131.html) -- Gitee From ed5b21111b910f91145e41a6f4a74497d157d3fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 20 Feb 2025 10:34:48 +0800 Subject: [PATCH 07/21] fix --- .../9_amct/amct_pytorch/ada_round_calibration/README_CN.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md index 0c5f7b7cb..825cd0471 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/README_CN.md @@ -24,8 +24,8 @@ |common_config.ada_quantize.warm_start|float|预热因子|0.2|(0,1)| |common_config.ada_quantize.reg_param|float|正则化参数|0.01|(0,1)| |common_config.ada_quantize.channel_wise|bool|是否开启pre channel|true|false/true| -|common_config.ada_quantize.beta_range_start|uint|退火起始参数|20|/| 
-|common_config.ada_quantize.beta_range_end|uint|退火终止参数|2|/| +|common_config.ada_quantize.beta_range_start|uint|退火起始参数|20|>beta_range_end| +|common_config.ada_quantize.beta_range_end|uint|退火终止参数|2|>0| 更多参数配置请参考[官方文档](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/devaids/devtools/amct/atlasamct_16_0131.html) -- Gitee From 8612ca79f28606ad15393d039dbb4ddc89a5b6f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 20 Feb 2025 11:25:39 +0800 Subject: [PATCH 08/21] delete --- .../9_amct/amct_pytorch/ada_round_calibration/data/.keep | 0 .../9_amct/amct_pytorch/ada_round_calibration/model/.keep | 0 .../9_amct/amct_pytorch/ada_round_calibration/src/__init__.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep delete mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep delete mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/src/__init__.py deleted file mode 100644 index e69de29bb..000000000 -- Gitee From 702da07eecc5f0235d84e3633f4f690e1bcda7c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 20 Feb 2025 11:37:52 +0800 Subject: [PATCH 09/21] fix --- .../9_amct/amct_pytorch/ada_round_calibration/data/.gitkeep | 0 .../9_amct/amct_pytorch/ada_round_calibration/model/.gitkeep | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.gitkeep create mode 100644 python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.gitkeep diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.gitkeep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/data/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.gitkeep b/python/level1_single_api/9_amct/amct_pytorch/ada_round_calibration/model/.gitkeep new file mode 100644 index 000000000..e69de29bb -- Gitee From c42b12b5b39f27b81b63d0a7f279d880c16eb3a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Wed, 23 Apr 2025 14:28:19 +0800 Subject: [PATCH 10/21] fp8/hi8 weight only --- .../hif8_fp8_weight_quantization/README_CN.md | 43 +++++ .../requirements.txt | 7 + .../src/quantization.cfg | 5 + .../src/run_llama7b_quantization.py | 153 ++++++++++++++++++ .../hif8_fp8_weight_quantization/src/utils.py | 82 ++++++++++ 5 files changed, 290 insertions(+) create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt create mode 100644 
python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md new file mode 100644 index 000000000..e7d9e9df8 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -0,0 +1,43 @@ +# MXFP4量化 + +## 1 MXFP4量化前提 + +### 1.1 安装依赖 + +本sample依赖包可参考[requirements.txt](requirements.txt) + +### 1.2 模型和数据集准备 + +本sample以Llama2-7b模型,pileval和wikitext2数据集为示例,请用户自行下载,并适配utils.py文件中加载数据集和模型的路径。当前sample中数据集保存目录需根据实际保存目录修改。 + +### 1.3 简易量化配置 +./src/quantization.cfg文件为用户自定义的简易量化配置,具体表示信息如下: + +| 字段 |类型| 说明 | 默认值 | 取值范围 | +|:--| :-: | :-- | :-: | :-: | +|skip_layers|str|跳过量化的层 |/|/|跳过量化层支持模糊匹配,当配置字符串为层名字串,或与层名一致时,跳过该层量化,不生成量化配置。字符串必须包含数字或字母| +|weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False| +|weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN| + +> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3,如果想HIFLOAT8仅权重量化,请适配修改quantization.cfg +## 2 FLOAT8_E4M3FN量化示例 + +### 2.1 使用接口方式调用 + +请在当前目录执行如下命令运行示例程序,用户需根据实际情况修改示例程序中的模型和数据集路径: + +`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py` + +若出现如下信息,则说明量化成功: + +```none +Test time taken: 1.0 min 59.24865388870239 s +Score: 5.670858383178711 +``` + +推理成功后,在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./outputs文件夹,该文件夹内包含以下内容: + +- config.json:量化配置文件,描述了如何对模型中的每一层进行量化。 +- record.txt:量化因子记录文件。 + +> 如果outputs目录下已经存在量化配置文件或量化因子记录文件,再次运行示例程序时,如果新生成的文件与已有文件同名,则会覆盖已有的量化配置文件或量化因子记录文件。 diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt new file mode 100644 index 000000000..55441d062 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt @@ -0,0 +1,7 @@ +torch==2.1.0 +transformers==4.40.0 +accelerate==0.30.1 +datasets==2.19.1 +sentencepiece==0.2.0 +numpy==1.23.5 +protobuf==3.20.2 \ No newline at end of file diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg new file mode 100644 index 000000000..179e23bac --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg @@ -0,0 +1,5 @@ +skip_layers: "lm_head" +weight_only_config: { + weight_compress_only: True + wts_type: FLOAT8_E4M3 +} \ No newline at end of file diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py new file mode 100644 index 000000000..8ad31a556 --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py @@ -0,0 +1,153 @@ +""" +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance 
with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + + +import os +import copy +import time +import tqdm +import torch +import torch.nn as nn +from transformers import AutoTokenizer, AutoConfig +from accelerate import infer_auto_device_map, dispatch_model +from accelerate.utils.modeling import get_balanced_memory + +from utils import get_loaders, get_llama2, get_calib_dataset +import amct_pytorch as amct + + +def build_model_and_enc(model, model_path, gpu_num): + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + if "mpt" in config.__class__.__name__.lower(): + enc = AutoTokenizer.from_pretrained( + config.tokenizer_name, trust_remote_code=True + ) + else: + enc = AutoTokenizer.from_pretrained( + model_path, use_fast=False, trust_remote_code=True + ) + + # Move the model to GPU (as much as possible) for LM evaluation + # max_memory = ['0:16GiB', '1:16GiB','2:16GiB', 'cpu:30GiB'], '0' means the first GPU that you specify. + # I don't recommend use 16GiB, we need to reserve some space for other tensors during calculation + # please see the recommand memeory allocation in the Word file + # Adjust the max_size accroding to the real situation + # a clever way: + + max_memory = [] + for i in range(gpu_num): + max_memory.append(f'{i}:12GiB') + max_memory.append('cpu:80GiB') + print('Max_memory allocation: \n', max_memory) + + max_memory = [v.split(":") for v in (max_memory or [])] + max_memory = {(int(k) if k.isdigit() else k): v for k, v in max_memory} + kwargs = { + "max_memory": get_balanced_memory( + model, max_memory if len(max_memory) > 0 else None + ) + } + model.tie_weights() + device_map = infer_auto_device_map( + model, + no_split_module_classes=[ + "LlamaDecoderLayer", + ], + **kwargs, + ) + model = dispatch_model(model, device_map=device_map, + offload_dir=os.path.join(model_path, 'offload_dir')) + + return model, enc + +if __name__ == '__main__': + model, model_path = get_llama2('7b') + model = model.eval() + copied_model = copy.deepcopy(model) + gpu_num = torch.cuda.device_count() + model, enc = build_model_and_enc(model, model_path, gpu_num) + + proto_path = './src/quantization.cfg' + config_file = './output/config.json' + record_file = './output/record.txt' + + test_start_time = time.time() + # Phase1: generate quant config json + amct.create_post_quant_config(config_file, + model, + config_defination=proto_path) + + # Phase2: do weights calibration and generate calibration model + samples = get_calib_dataset( + data="pileval", tokenizer=enc, n_samples=512, block_size=256 + ) + samples = torch.cat(samples, dim=0)[:1,:] + model.config.use_cache = False + post_quant_model = amct.create_post_quant_model(config_file, + record_file, + model) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + post_quant_model.config.use_cache = False + with torch.no_grad(): + post_quant_model(samples.to(next(post_quant_model.parameters()).device)) + torch.cuda.empty_cache() + test_end_time = time.time() + total_time = test_end_time - test_start_time + print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's') + # save memory, del unuse model + del post_quant_model + + 
if torch.cuda.is_available(): + torch.cuda.empty_cache() + model, enc = build_model_and_enc(copied_model, model_path, gpu_num) + + # Phase3: save fakequant model + testenc = get_loaders(dataset_name='wikitext2', + enc=enc, + seqlen=model.seqlen) + + testenc = testenc.input_ids.to(model.device) + fake_quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant') + nsamples = testenc.numel() // model.seqlen + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Phase4: Test ppl result + nlls = [] + test_start_time = time.time() + for i in tqdm.tqdm(range(nsamples), desc="evaluating..."): + batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to( + model.device + ) + with torch.no_grad(): + lm_logits = fake_quant_model(batch).logits + shift_logits = lm_logits[:, :-1, :].contiguous().float() + shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) + ) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + test_end_time = time.time() + + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + + total_time = test_end_time - test_start_time + print('Test time taken: ', total_time // 60, 'min ', total_time%60, 's' ) + print('Score: ', ppl.item()) \ No newline at end of file diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py new file mode 100644 index 000000000..af20318be --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py @@ -0,0 +1,82 @@ +""" +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import torch +import torch.nn as nn +from datasets import load_dataset,load_from_disk + +def get_llama2(model, seqlen=2048): + '''If model is specified from ['7b', '13b', '70b'], then we load official pretrained model; + If you want to load checkpoints other than the official ones, please specifiy the model path, + otherwise please choose from ['7b', '13b', '70b'] for better clarity + ''' + + def skip(*args, **kwargs): + pass + + if model in ['7b', '13b', '70b']: + model_path = f'/data/Models/pytorch/Llama2/Llama2_{model}_hf' + print(f'Getting official pretrained Llama2-{model}') + else: + model_path = model + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + + model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, offload_folder="offload/") + + model.seqlen = seqlen + return model, model_path + + +def get_loaders(dataset_name: str, enc, seqlen): + if dataset_name == 'wikitext2': + print('Loading dataset: Wikitext2') + testenc = load_dataset('/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py', 'wikitext-2-raw-v1', split='test', trust_remote_code=True) + testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt") + + return testenc + + +def get_calib_dataset(data="pileval", tokenizer=None, n_samples=512, block_size=512): + if data == "pileval": + dataset = load_from_disk('/pile_val_backup') + else: + raise NotImplementedError + dataset = dataset.shuffle(seed=42) + samples = [] + n_run = 0 + for data in dataset: + line = data["text"] + line = line.strip() + line_encoded = tokenizer.encode(line) + if len(line_encoded) > 512: + continue + sample = torch.tensor([line_encoded]) + if sample.numel() == 0: + continue + samples.append(sample) + n_run += 1 + if n_run == n_samples: + break + # now concatenate all samples and split according to block size + cat_samples = torch.cat(samples, dim=1) + n_split = cat_samples.shape[1] // block_size + print(f" * Split into {n_split} blocks") + return [ + cat_samples[:, i * block_size : (i + 1) * block_size] for i in range(n_split) + ] -- Gitee From 39e8781f5fcecb4edb422e617beb36de55aa85f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Wed, 23 Apr 2025 15:40:08 +0800 Subject: [PATCH 11/21] fix --- .../hif8_fp8_weight_quantization/README_CN.md | 8 ++++---- .../hif8_fp8_weight_quantization/src/quantization.cfg | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index e7d9e9df8..a0b2f11c9 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -19,7 +19,7 @@ |weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False| |weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN| -> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3,如果想HIFLOAT8仅权重量化,请适配修改quantization.cfg +> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3FN,如果想HIFLOAT8仅权重量化,请适配修改quantization.cfg ## 2 FLOAT8_E4M3FN量化示例 ### 2.1 使用接口方式调用 @@ -31,11 +31,11 @@ 若出现如下信息,则说明量化成功: ```none -Test time taken: 1.0 min 59.24865388870239 s -Score: 5.670858383178711 +Test time taken: 1.0 min 38.24865388870239 s +Score: 
5.481424331665039 ``` -推理成功后,在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./outputs文件夹,该文件夹内包含以下内容: +推理成功后,在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./output文件夹,该文件夹内包含以下内容: - config.json:量化配置文件,描述了如何对模型中的每一层进行量化。 - record.txt:量化因子记录文件。 diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg index 179e23bac..2d8b3dcc3 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg @@ -1,5 +1,5 @@ skip_layers: "lm_head" weight_only_config: { weight_compress_only: True - wts_type: FLOAT8_E4M3 + wts_type: FLOAT8_E4M3FN } \ No newline at end of file -- Gitee From fcac690da83206a5dab5e41a43842026f0e7c162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Thu, 24 Apr 2025 09:06:01 +0800 Subject: [PATCH 12/21] FIX --- .../amct_pytorch/hif8_fp8_weight_quantization/README_CN.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index a0b2f11c9..e63515c06 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -1,6 +1,6 @@ -# MXFP4量化 +# FP8/HIF8量化 -## 1 MXFP4量化前提 +## 1 FP8/HIF8量化前提 ### 1.1 安装依赖 @@ -15,7 +15,7 @@ | 字段 |类型| 说明 | 默认值 | 取值范围 | |:--| :-: | :-- | :-: | :-: | -|skip_layers|str|跳过量化的层 |/|/|跳过量化层支持模糊匹配,当配置字符串为层名字串,或与层名一致时,跳过该层量化,不生成量化配置。字符串必须包含数字或字母| +|skip_layers|str|跳过量化的层 |/|/| |weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False| |weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN| -- Gitee From 1632297a55aa637d56b09befdb9198cd76cc9611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Fri, 25 Apr 2025 09:55:10 +0800 Subject: [PATCH 13/21] fix --- .../src/run_llama7b_quantization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py index 8ad31a556..092238d22 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py @@ -104,7 +104,8 @@ if __name__ == '__main__': post_quant_model.config.use_cache = False with torch.no_grad(): post_quant_model(samples.to(next(post_quant_model.parameters()).device)) - torch.cuda.empty_cache() + if torch.cuda.is_available(): + torch.cuda.empty_cache() test_end_time = time.time() total_time = test_end_time - test_start_time print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's') -- Gitee From 06468d2222c445874f712f3be14812aa5ba928d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Fri, 25 Apr 2025 12:03:02 +0800 Subject: [PATCH 14/21] fix --- .../amct_pytorch/hif8_fp8_weight_quantization/README_CN.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) 
diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index e63515c06..5f6358064 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -19,8 +19,10 @@ |weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False| |weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN| -> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3FN,如果想HIFLOAT8仅权重量化,请适配修改quantization.cfg ## 2 FLOAT8_E4M3FN量化示例 +> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3FN,如果需要HIFLOAT8仅权重量化,请适配修改quantization.cfg + +> 如果要验证deploy模型,需要设置save_post_quant_model接口中参数mode为'deploy',并将生成的部署模型搬到npu上进行推理 ### 2.1 使用接口方式调用 -- Gitee From 0f2773106d5e6cfb59b995347147705005764677 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Fri, 25 Apr 2025 14:07:52 +0800 Subject: [PATCH 15/21] fix --- .../amct_pytorch/hif8_fp8_weight_quantization/README_CN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index 5f6358064..04b9c0973 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -34,7 +34,7 @@ ```none Test time taken: 1.0 min 38.24865388870239 s -Score: 5.481424331665039 +Score: 5.48 ``` 推理成功后,在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./output文件夹,该文件夹内包含以下内容: -- Gitee From f2f3fef03944e4175893488799fa23667f2cdb04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Tue, 29 Apr 2025 11:38:43 +0800 Subject: [PATCH 16/21] fix npu --- .../hif8_fp8_weight_quantization/README_CN.md | 15 ++++++--- .../src/run_llama7b_quantization.py | 31 +++++++++++++------ .../hif8_fp8_weight_quantization/src/utils.py | 26 ++++++---------- 3 files changed, 42 insertions(+), 30 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index 04b9c0973..1b575ad98 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -8,7 +8,7 @@ ### 1.2 模型和数据集准备 -本sample以Llama2-7b模型,pileval和wikitext2数据集为示例,请用户自行下载,并适配utils.py文件中加载数据集和模型的路径。当前sample中数据集保存目录需根据实际保存目录修改。 +本sample以Llama2-7b模型,pileval和wikitext2数据集为示例,请用户自行下载。 ### 1.3 简易量化配置 ./src/quantization.cfg文件为用户自定义的简易量化配置,具体表示信息如下: @@ -22,13 +22,20 @@ ## 2 FLOAT8_E4M3FN量化示例 > 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3FN,如果需要HIFLOAT8仅权重量化,请适配修改quantization.cfg -> 如果要验证deploy模型,需要设置save_post_quant_model接口中参数mode为'deploy',并将生成的部署模型搬到npu上进行推理 ### 2.1 使用接口方式调用 -请在当前目录执行如下命令运行示例程序,用户需根据实际情况修改示例程序中的模型和数据集路径: +请在当前目录执行如下命令运行示例程序 -`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py` +验证fakequant模型脚本: + +`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py --test_on_npu_flag=false --calibration_data=/pile_val_backup/ 
--verify_data=/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py --model=/data/Models/pytorch/Llama2/Llama2_7b_hf` + +验证deploy模型脚本: + +`python3 src/run_llama7b_quantization.py --test_on_npu_flag=true` + +> test_on_npu_flag参数表明是否生成部署模型在npu上推理,calibration_data参数为校准集路径,verify_data为验证集的路径,model为模型存放路径 若出现如下信息,则说明量化成功: diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py index 092238d22..4380377ac 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py @@ -14,7 +14,7 @@ # limitations under the License. """ - +import argparse import os import copy import time @@ -74,7 +74,14 @@ def build_model_and_enc(model, model_path, gpu_num): return model, enc if __name__ == '__main__': - model, model_path = get_llama2('7b') + parser = argparse.ArgumentParser() + parser.add_argument('--test_on_npu_flag', type=lambda x: (str(x).lower() == 'true')) + parser.add_argument('--calibration_data', type=str, default="/pile_val_backup/") + parser.add_argument('--verify_data', type=str, default="/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py") + parser.add_argument('--model', type=str, default="/data/Models/pytorch/Llama2/Llama2_7b_hf") + + args = parser.parse_args() + model, model_path = get_llama2(args.model) model = model.eval() copied_model = copy.deepcopy(model) gpu_num = torch.cuda.device_count() @@ -92,7 +99,7 @@ if __name__ == '__main__': # Phase2: do weights calibration and generate calibration model samples = get_calib_dataset( - data="pileval", tokenizer=enc, n_samples=512, block_size=256 + data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=256 ) samples = torch.cat(samples, dim=0)[:1,:] model.config.use_cache = False @@ -117,12 +124,18 @@ if __name__ == '__main__': model, enc = build_model_and_enc(copied_model, model_path, gpu_num) # Phase3: save fakequant model - testenc = get_loaders(dataset_name='wikitext2', + testenc = get_loaders(data_path=args.verify_data, enc=enc, seqlen=model.seqlen) testenc = testenc.input_ids.to(model.device) - fake_quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant') + + if args.test_on_npu_flag: + quant_model = amct.save_post_quant_model(record_file, model, mode='deploy') + quant_model = quant_model.npu() + else: + quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant') + nsamples = testenc.numel() // model.seqlen if torch.cuda.is_available(): @@ -133,12 +146,12 @@ if __name__ == '__main__': test_start_time = time.time() for i in tqdm.tqdm(range(nsamples), desc="evaluating..."): batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to( - model.device + quant_model.device ) with torch.no_grad(): - lm_logits = fake_quant_model(batch).logits - shift_logits = lm_logits[:, :-1, :].contiguous().float() - shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:] + lm_logits = quant_model(batch).logits + shift_logits = lm_logits[:, :-1, :].contiguous().float().cpu() + shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:].cpu() loss_fct = nn.CrossEntropyLoss() loss = loss_fct( shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) diff --git 
a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py index af20318be..5bea4e1fc 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py @@ -18,7 +18,7 @@ import torch import torch.nn as nn from datasets import load_dataset,load_from_disk -def get_llama2(model, seqlen=2048): +def get_llama2(model_path, seqlen=2048): '''If model is specified from ['7b', '13b', '70b'], then we load official pretrained model; If you want to load checkpoints other than the official ones, please specifiy the model path, otherwise please choose from ['7b', '13b', '70b'] for better clarity @@ -27,36 +27,28 @@ def get_llama2(model, seqlen=2048): def skip(*args, **kwargs): pass - if model in ['7b', '13b', '70b']: - model_path = f'/data/Models/pytorch/Llama2/Llama2_{model}_hf' - print(f'Getting official pretrained Llama2-{model}') - else: - model_path = model torch.nn.init.kaiming_uniform_ = skip torch.nn.init.uniform_ = skip torch.nn.init.normal_ = skip from transformers import LlamaForCausalLM - model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, offload_folder="offload/") + model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, offload_folder="offload/") model.seqlen = seqlen return model, model_path -def get_loaders(dataset_name: str, enc, seqlen): - if dataset_name == 'wikitext2': - print('Loading dataset: Wikitext2') - testenc = load_dataset('/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py', 'wikitext-2-raw-v1', split='test', trust_remote_code=True) - testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt") +def get_loaders(data_path: str, enc, seqlen): + + print('Loading dataset: Wikitext2') + testenc = load_dataset(data_path, 'wikitext-2-raw-v1', split='test', trust_remote_code=True) + testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt") return testenc -def get_calib_dataset(data="pileval", tokenizer=None, n_samples=512, block_size=512): - if data == "pileval": - dataset = load_from_disk('/pile_val_backup') - else: - raise NotImplementedError +def get_calib_dataset(data_path, tokenizer=None, n_samples=512, block_size=512): + dataset = load_from_disk(data_path) dataset = dataset.shuffle(seed=42) samples = [] n_run = 0 -- Gitee From 47e489f94bce556a7fc81806b8f07e7b551990a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Tue, 29 Apr 2025 11:52:21 +0800 Subject: [PATCH 17/21] fix --- .../amct_pytorch/hif8_fp8_weight_quantization/README_CN.md | 2 +- .../src/run_llama7b_quantization.py | 6 +++--- .../amct_pytorch/hif8_fp8_weight_quantization/src/utils.py | 5 ----- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md index 1b575ad98..2c5cc0108 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md @@ -31,7 +31,7 @@ `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py --test_on_npu_flag=false --calibration_data=/pile_val_backup/ 
--verify_data=/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py --model=/data/Models/pytorch/Llama2/Llama2_7b_hf` -验证deploy模型脚本: +验证deploy模型脚本(需要适配npu相关环境): `python3 src/run_llama7b_quantization.py --test_on_npu_flag=true` diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py index 4380377ac..c30010c17 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py @@ -76,9 +76,9 @@ def build_model_and_enc(model, model_path, gpu_num): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--test_on_npu_flag', type=lambda x: (str(x).lower() == 'true')) - parser.add_argument('--calibration_data', type=str, default="/pile_val_backup/") - parser.add_argument('--verify_data', type=str, default="/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py") - parser.add_argument('--model', type=str, default="/data/Models/pytorch/Llama2/Llama2_7b_hf") + parser.add_argument('--calibration_data', type=str, default='/pile_val_backup/') + parser.add_argument('--verify_data', type=str, default='/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py') + parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf/') args = parser.parse_args() model, model_path = get_llama2(args.model) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py index 5bea4e1fc..586916fbd 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py @@ -19,11 +19,6 @@ import torch.nn as nn from datasets import load_dataset,load_from_disk def get_llama2(model_path, seqlen=2048): - '''If model is specified from ['7b', '13b', '70b'], then we load official pretrained model; - If you want to load checkpoints other than the official ones, please specifiy the model path, - otherwise please choose from ['7b', '13b', '70b'] for better clarity - ''' - def skip(*args, **kwargs): pass -- Gitee From 7ac2ce67740b2229a7be300d69116c892a1640f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Tue, 29 Apr 2025 11:55:58 +0800 Subject: [PATCH 18/21] fix --- .../src/run_llama7b_quantization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py index c30010c17..c11e09fc6 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py +++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py @@ -76,9 +76,9 @@ def build_model_and_enc(model, model_path, gpu_num): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--test_on_npu_flag', type=lambda x: (str(x).lower() == 'true')) - parser.add_argument('--calibration_data', type=str, 
default='/pile_val_backup/')
+    parser.add_argument('--calibration_data', type=str, default='/pile_val_backup')
     parser.add_argument('--verify_data', type=str, default='/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py')
-    parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf/')
+    parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf')
 
     args = parser.parse_args()
     model, model_path = get_llama2(args.model)
-- Gitee

From 16180368e0e692db2c38fe6bed50ee21917a0087 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com>
Date: Tue, 29 Apr 2025 14:31:00 +0800
Subject: [PATCH 19/21] fix

---
 .../src/run_llama7b_quantization.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
index c11e09fc6..2b2f14603 100644
--- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
+++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
@@ -102,17 +102,17 @@ if __name__ == '__main__':
         data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=256
     )
     samples = torch.cat(samples, dim=0)[:1,:]
-    model.config.use_cache = False
+
     post_quant_model = amct.create_post_quant_model(config_file,
                                                     record_file,
                                                     model)
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-    post_quant_model.config.use_cache = False
+
     with torch.no_grad():
         post_quant_model(samples.to(next(post_quant_model.parameters()).device))
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     test_end_time = time.time()
     total_time = test_end_time - test_start_time
     print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's')
-- Gitee

From 11a4afa09abbfa07071b87cbf322b9aad7100582 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com>
Date: Mon, 23 Jun 2025 20:07:53 +0800
Subject: [PATCH 20/21] add fp4

---
 .../fp4_weight_quantization/README_CN.md      |  50 ++++
 .../fp4_weight_quantization/requirements.txt  |   7 +
 .../src/quantization.cfg                      |   8 +
 .../src/run_llama7b_quantization.py           | 162 ++++++++++++++++++
 .../fp4_weight_quantization/src/utils.py      |  69 ++++++++
 5 files changed, 296 insertions(+)
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py

diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
new file mode 100644
index 000000000..8482006cd
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
@@ -0,0 +1,50 @@
+# FP4伪量化
+
+## 1 FP4伪量化前提
+
+### 1.1 安装依赖
+
+本sample依赖包可参考[requirements.txt](requirements.txt)
+
+### 1.2 模型和数据集准备
+
+本sample以Llama2-7b模型,pileval和wikitext2数据集为示例,请用户自行下载。
+
+### 1.3 简易量化配置
+./src/quantization.cfg文件为用户自定义的简易量化配置,具体表示信息如下:
+
+| 字段 |类型| 说明 | 默认值 | 取值范围 |
+|:--| :-: | :-- | :-: | :-: |
+|skip_layers|str|跳过量化的层 |/|/|
+|weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False|
+|weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/FLOAT4_E2M1/HIFLOAT8/FLOAT8_E4M3FN|
+|weight_only_config.awq_quantize.grids_num|int|awq搜索格点数量|20|/|
+
+## 2 FLOAT4_E2M1量化示例
+> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT4_E2M1
+
+
+### 2.1 使用接口方式调用
+
+请在当前目录执行如下命令运行示例程序:
+
+验证fakequant模型脚本:
+
+`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py --calibration_data=/pile_val_backup/ --verify_data=/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py --model=/data/Models/pytorch/Llama2/Llama2_7b_hf`
+
+
+若出现如下信息,则说明量化成功:
+
+```none
+Test time taken: 9.0 min 38.24865388870239 s
+Score: 5.657759
+```
+
+推理成功后,在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./output文件夹,该文件夹内包含以下内容:
+
+- config.json:量化配置文件,描述了如何对模型中的每一层进行量化。
+- record.txt:量化因子记录文件。
+- awq_result.pt:存储了awq算法的scale和clip。
+- quant_factor.pt:存储量化缩放因子。
+
+> 如果output目录下已经存在量化配置文件或量化因子记录文件,再次运行示例程序时,如果新生成的文件与已有文件同名,则会覆盖已有的量化配置文件或量化因子记录文件。
diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt
new file mode 100644
index 000000000..55441d062
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt
@@ -0,0 +1,7 @@
+torch==2.1.0
+transformers==4.40.0
+accelerate==0.30.1
+datasets==2.19.1
+sentencepiece==0.2.0
+numpy==1.23.5
+protobuf==3.20.2
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg
new file mode 100644
index 000000000..a43152ad3
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg
@@ -0,0 +1,8 @@
+skip_layers: "lm_head"
+weight_only_config: {
+    weight_compress_only: True
+    wts_type: FLOAT4_E2M1
+    awq_quantize:{
+        grids_num: 20
+    }
+}
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
new file mode 100644
index 000000000..76d164c0f
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
@@ -0,0 +1,162 @@
+"""
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" + +import argparse +import os +import copy +import time +import tqdm +import torch +import torch.nn as nn +from transformers import AutoTokenizer, AutoConfig +from accelerate import infer_auto_device_map, dispatch_model +from accelerate.utils.modeling import get_balanced_memory + +from utils import get_loaders, get_llama2, get_calib_dataset +import amct_pytorch as amct + + +def build_model_and_enc(model, model_path, gpu_num): + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + if "mpt" in config.__class__.__name__.lower(): + enc = AutoTokenizer.from_pretrained( + config.tokenizer_name, trust_remote_code=True + ) + else: + enc = AutoTokenizer.from_pretrained( + model_path, use_fast=False, trust_remote_code=True + ) + + # Move the model to GPU (as much as possible) for LM evaluation + # max_memory = ['0:16GiB', '1:16GiB','2:16GiB', 'cpu:30GiB'], '0' means the first GPU that you specify. + # I don't recommend use 16GiB, we need to reserve some space for other tensors during calculation + # please see the recommand memeory allocation in the Word file + # Adjust the max_size accroding to the real situation + # a clever way: + + max_memory = [] + for i in range(gpu_num): + max_memory.append(f'{i}:12GiB') + max_memory.append('cpu:80GiB') + print('Max_memory allocation: \n', max_memory) + + max_memory = [v.split(":") for v in (max_memory or [])] + max_memory = {(int(k) if k.isdigit() else k): v for k, v in max_memory} + kwargs = { + "max_memory": get_balanced_memory( + model, max_memory if len(max_memory) > 0 else None + ) + } + model.tie_weights() + device_map = infer_auto_device_map( + model, + no_split_module_classes=[ + "LlamaDecoderLayer", + ], + **kwargs, + ) + model = dispatch_model(model, device_map=device_map, + offload_dir=os.path.join(model_path, 'offload_dir')) + + return model, enc + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--calibration_data', type=str, default='/pile_val_backup') + parser.add_argument('--verify_data', type=str, default='/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py') + parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf') + + args = parser.parse_args() + model, model_path = get_llama2(args.model) + model = model.eval() + copied_model = copy.deepcopy(model) + gpu_num = torch.cuda.device_count() + model, enc = build_model_and_enc(model, model_path, gpu_num) + + proto_path = './src/quantization.cfg' + config_file = './output/config.json' + record_file = './output/record.txt' + + test_start_time = time.time() + # Phase1: generate quant config json + amct.create_post_quant_config(config_file, + model, + config_defination=proto_path) + + # Phase2: do weights calibration and generate calibration model + samples = get_calib_dataset( + data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=256 + ) + samples = torch.cat(samples, dim=0)[:1,:] + + post_quant_model = amct.create_post_quant_model(config_file, + record_file, + model) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + with torch.no_grad(): + post_quant_model(samples.to(next(post_quant_model.parameters()).device)) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + test_end_time = time.time() + total_time = test_end_time - test_start_time + print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's') + # save memory, del unuse model + del post_quant_model + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + 
model, enc = build_model_and_enc(copied_model, model_path, gpu_num) + + # Phase3: save fakequant model + testenc = get_loaders(data_path=args.verify_data, + enc=enc, + seqlen=model.seqlen) + + testenc = testenc.input_ids.to(model.device) + + quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant') + + nsamples = testenc.numel() // model.seqlen + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Phase4: Test ppl result + nlls = [] + test_start_time = time.time() + for i in tqdm.tqdm(range(nsamples), desc="evaluating..."): + batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to( + quant_model.device + ) + with torch.no_grad(): + lm_logits = quant_model(batch).logits + shift_logits = lm_logits[:, :-1, :].contiguous().float().cpu() + shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:].cpu() + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) + ) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + test_end_time = time.time() + + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + + total_time = test_end_time - test_start_time + print('Test time taken: ', total_time // 60, 'min ', total_time%60, 's' ) + print('Score: ', ppl.item()) \ No newline at end of file diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py new file mode 100644 index 000000000..586916fbd --- /dev/null +++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py @@ -0,0 +1,69 @@ +""" +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import torch +import torch.nn as nn +from datasets import load_dataset,load_from_disk + +def get_llama2(model_path, seqlen=2048): + def skip(*args, **kwargs): + pass + + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + + model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, offload_folder="offload/") + + model.seqlen = seqlen + return model, model_path + + +def get_loaders(data_path: str, enc, seqlen): + + print('Loading dataset: Wikitext2') + testenc = load_dataset(data_path, 'wikitext-2-raw-v1', split='test', trust_remote_code=True) + testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt") + + return testenc + + +def get_calib_dataset(data_path, tokenizer=None, n_samples=512, block_size=512): + dataset = load_from_disk(data_path) + dataset = dataset.shuffle(seed=42) + samples = [] + n_run = 0 + for data in dataset: + line = data["text"] + line = line.strip() + line_encoded = tokenizer.encode(line) + if len(line_encoded) > 512: + continue + sample = torch.tensor([line_encoded]) + if sample.numel() == 0: + continue + samples.append(sample) + n_run += 1 + if n_run == n_samples: + break + # now concatenate all samples and split according to block size + cat_samples = torch.cat(samples, dim=1) + n_split = cat_samples.shape[1] // block_size + print(f" * Split into {n_split} blocks") + return [ + cat_samples[:, i * block_size : (i + 1) * block_size] for i in range(n_split) + ] -- Gitee From 8ab0d35be3876ccead6bfb06e5a43b08fa5167fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com> Date: Wed, 25 Jun 2025 10:28:26 +0800 Subject: [PATCH 21/21] fix --- .../9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md | 2 +- .../fp4_weight_quantization/src/run_llama7b_quantization.py | 2 +- .../9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md index 8482006cd..93ea0a9ce 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md +++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md @@ -18,7 +18,7 @@ |skip_layers|str|跳过量化的层 |/|/| |weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False| |weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN| -|weight_only_config.awq_quantize.grids_num|int|awq搜索格点数量|20|/|/| +|weight_only_config.awq_quantize.grids_num|uint32|awq搜索格点数量|20|/|/| ## 2 FLOAT4_E2M1量化示例 > 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT4_E2M1 diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py index 76d164c0f..4aac4fad9 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py +++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py @@ -98,7 +98,7 @@ if __name__ == '__main__': # Phase2: do weights calibration and generate calibration model samples = get_calib_dataset( - data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=256 + data_path=args.calibration_data, tokenizer=enc, 
n_samples=512, block_size=518 ) samples = torch.cat(samples, dim=0)[:1,:] diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py index 586916fbd..474a5b618 100644 --- a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py +++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py @@ -27,7 +27,7 @@ def get_llama2(model_path, seqlen=2048): torch.nn.init.normal_ = skip from transformers import LlamaForCausalLM - model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, offload_folder="offload/") + model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, offload_folder="offload/") model.seqlen = seqlen return model, model_path -- Gitee
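The evaluation loop shared by the FP8/HIF8 and FP4 samples (Phase 4 of `run_llama7b_quantization.py`) scores the fakequant model by perplexity over non-overlapping `seqlen`-token windows of the tokenized test set. A self-contained sketch of that computation, assuming a Hugging Face causal LM exposing `.logits` and `.device`; the helper name `eval_perplexity` and the approximation of scaling the mean loss by the full window length follow the sample code:

```python
import torch
import torch.nn as nn

def eval_perplexity(model, token_ids: torch.Tensor, seqlen: int) -> float:
    """token_ids: LongTensor of shape [1, total_tokens] from the tokenizer."""
    nsamples = token_ids.numel() // seqlen
    loss_fct = nn.CrossEntropyLoss()
    nlls = []
    for i in range(nsamples):
        batch = token_ids[:, i * seqlen:(i + 1) * seqlen].to(model.device)
        with torch.no_grad():
            logits = model(batch).logits
        # Next-token objective: logits at positions 0..n-2 are scored
        # against the labels at positions 1..n-1.
        shift_logits = logits[:, :-1, :].contiguous().float().cpu()
        shift_labels = batch[:, 1:].cpu()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                        shift_labels.view(-1))
        nlls.append(loss * seqlen)  # approximate total NLL for the window
    return torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen)).item()

# Lower is better: the patched READMEs report a score of 5.48 for the
# FLOAT8_E4M3FN fakequant model and 5.657759 for FLOAT4_E2M1.
```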