diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/attention_processor.patch b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/attention_processor.patch
index 26f296526adcfaf629f3c47a311b88bb4aa002a2..bd15281c5a3acf9752eec8a239323f66f1beadb7 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/attention_processor.patch
+++ b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/attention_processor.patch
@@ -1,5 +1,5 @@
---- attention_processor.py	2024-02-22 19:06:56.596000000 +0800
-+++ attention_processor.py	2024-02-22 19:07:17.232000000 +0800
+--- attention_processor.py	2024-07-02 07:42:32.312000000 +0000
++++ attention_processor.py	2024-07-02 07:44:55.100000000 +0000
 @@ -205,10 +205,11 @@
          # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
          # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
@@ -14,5 +14,5 @@
 +        #     )
 +        processor = AttnProcessor()
          self.set_processor(processor)
-
-     def set_use_memory_efficient_attention_xformers(
\ No newline at end of file
+ 
+     def set_use_memory_efficient_attention_xformers(
diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/stable_diffusion_pipeline.py b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/stable_diffusion_pipeline.py
index e1ffc60b1010081568415ddfabba36cabc9c4612..a953ae48058562007619f4b8b032e42ad9898754 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/stable_diffusion_pipeline.py
+++ b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/stable_diffusion_pipeline.py
@@ -962,7 +962,8 @@ def main():
                 flag_cache=flag_cache,
             )
 
-        use_time += time.time() - start_time
+        if i > 4: # do not count the time spent inferring the first 0 to 4 images
+            use_time += time.time() - start_time
 
         for j in range(n_prompts):
             image_save_path = os.path.join(save_dir, f"{save_names[j]}.png")
@@ -975,6 +976,7 @@ def main():
 
             image_info[-1]['images'].append(image_save_path)
 
+    infer_num = infer_num - 5 # do not count the time spent inferring the first 5 images
     print(f"[info] infer number: {infer_num}; use time: {use_time:.3f}s\n"
           f"average time: {use_time / infer_num:.3f}s\n"
           f"clip time: {clip_time / infer_num:.3f}s\n"
diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/stable_diffusion_pipeline_parallel.py b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/stable_diffusion_pipeline_parallel.py
index e98b6e8e393ba21d72b84a7e93784dbfb7c625fd..76c7e606cafc1dac1bab12b83155c269a2fec6d7 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/stable_diffusion_pipeline_parallel.py
+++ b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion/stable_diffusion_pipeline_parallel.py
@@ -1050,7 +1050,8 @@ def main():
                 flag_cache=flag_cache,
             )
 
-        use_time += time.time() - start_time
+        if i > 4: # do not count the time spent inferring the first 0 to 4 images
+            use_time += time.time() - start_time
 
         for j in range(n_prompts):
             image_save_path = os.path.join(save_dir, f"{save_names[j]}.png")
@@ -1063,6 +1064,7 @@ def main():
 
             image_info[-1]['images'].append(image_save_path)
 
+    infer_num = infer_num - 5 # do not count the time spent inferring the first 5 images
     print(f"[info] infer number: {infer_num}; use time: {use_time:.3f}s\n"
           f"average time: {use_time / infer_num:.3f}s\n"
           f"clip time: {clip_time / infer_num:.3f}s\n"
diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/README.md b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/README.md
index 1865dac3912bed80b7fa5e498283493de3dfd71f..7396145eb4326bd71328693a70d281f63b3a928d 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/README.md
+++ b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/README.md
@@ -62,7 +62,7 @@
    pip3 install -r requirements.txt
    ```
 
-2. 安装mindie和mindietorch包
+2. 安装mindie包
 
    ```bash
    # 安装mindie
@@ -133,20 +133,47 @@
       ```
       参数说明：
       - --model：模型权重路径
-      - --output_dir: ONNX模型输出目录
-      - --use_cache: 【可选】在推理过程中使用cache
-      - --parallel: 【可选】导出适用于并行方案的模型，当前仅带unetCache优化时，支持并行
-      - --batch_size: 设置batch_size, 默认值为1,当前仅支持batch_size=1的场景
+      - --output_dir: 存放导出模型的路径
+      - --use_cache: 【可选】推荐在推理过程中使用unetCache策略
+      - --parallel: 【可选】导出适用于并行方案的模型, 当前仅带unetCache优化时，支持并行
+      - --batch_size: 设置batch_size, 默认值为1, 当前仅支持batch_size=1的场景
       - --flag：默认为0。0代表静态，只支持分辨率为1024x1024；1代表动态分档，支持的分辨率为1024x1024和512x512；2代表动态shape，height的范围为[512, 1024]，width的范围是[512, 1664]。
       - --soc：只支持Duo和A2。默认为A2。A2特指910B4。
       - --device：推理设备ID
    
 2. 开始推理验证。
 
-   1. 执行推理脚本。
+   1. 开启cpu高性能模式
+      ```bash
+      echo performance |tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+      sysctl -w vm.swappiness=0
+      sysctl -w kernel.numa_balancing=0
+      ```
+
+   2. 安装绑核工具
+      ```bash
+      apt-get update
+      apt-get install numactl
+      ```
+      查询卡的NUMA node
+      ```shell
+      lspci -vs bus-id
+      ```
+      bus-id可通过npu-smi info获得。查询到卡所在的NUMA node后，在推理命令前加上`numactl -C`及该NUMA node对应的CPU核编号范围
+
+      可通过lscpu获得各NUMA node对应的CPU核编号范围，例如：
+      ```shell
+      NUMA node0: 0-23
+      NUMA node1: 24-47
+      NUMA node2: 48-71
+      NUMA node3: 72-95
+      ```
+      当前查到NUMA node是0，对应0-23，推荐绑定其中单核以获得更好的性能。
+   
+   3. 执行推理脚本。
       ```bash
       # 不使用unetCache策略
-      python3 stable_diffusionxl_pipeline.py \
+      numactl -C 0-23 python3 stable_diffusionxl_pipeline.py \
               --model ${model_base} \
               --prompt_file ./prompts.txt \
               --device 0 \
@@ -158,7 +185,7 @@
               --width 1024
       
       # 使用UnetCache策略
-      python3 stable_diffusionxl_pipeline.py \
+      numactl -C 0-23 python3 stable_diffusionxl_pipeline.py \
               --model ${model_base} \
               --prompt_file ./prompts.txt \
               --device 0 \
@@ -171,7 +198,7 @@
               --width 1024
       
       # 使用UnetCache策略,同时使用双卡并行策略
-      python3 stable_diffusionxl_pipeline_cache_parallel.py \
+      numactl -C 0-23 python3 stable_diffusionxl_pipeline_cache_parallel.py \
               --model ${model_base} \
               --prompt_file ./prompts.txt \
               --device 0,1 \
@@ -185,34 +212,21 @@
       ```
       
       参数说明：
-      - --model：模型名称或本地模型目录的路径。
+      - --model：模型权重路径。
       - --output_dir：存放导出模型的目录。
       - --prompt_file：提示词文件。
       - --save_dir：生成图片的存放目录。
       - --batch_size：模型batch size。
       - --steps：生成图片迭代次数。
       - --device：推理设备ID；可用逗号分割传入两个设备ID，此时会使用并行方式进行推理。
-      - --use_cache: 【可选】在推理过程中使用cache。
-      - --cache_steps: 使用cache的迭代次数，迭代次数越多性能越好，但次数过多可能会导致精度下降。
+      - --use_cache: 【可选】推荐在推理过程中使用unetCache策略。
       - --flag：默认为0。0代表静态，只支持分辨率为1024x1024；1代表动态分档，支持的分辨率为1024x1024和512x512；2代表动态shape，height的范围为[512, 1024]，width的范围是[512, 1664]。**注意**：请与导出模型时设置的flag保持一致
       - --height：与flag标志位对应的height一致
       - --width：与flag标志位对应的width一致
       
-      不带unetCache策略，执行完成后在`./results`目录下生成推理图片。并在终端显示推理时间，参考如下：
-   
-      ```
-      [info] infer number: 16; use time: 150.567s; average time: 9.410s
-      ```
-      
-      带unetCache策略，执行完成后在`./results_unetCache`目录下生成推理图片。并在终端显示推理时间，参考如下：
-      ```
-      [info] infer number: 16; use time: 71.855s; average time: 4.491s
-      ```
-      
-      带unetCache策略，同时使用双卡并行策略，执行完成后在`./results_unetCache_parallel`目录下生成推理图片。并在终端显示推理时间，参考如下：
-      ```
-      [info] infer number: 16; use time: 47.351s; average time: 2.959s
-      ```
+      不带unetCache策略，执行完成后在`./results`目录下生成推理图片，在当前目录生成一个`image_info.json`文件，记录着图片和prompt的对应关系，并在终端显示推理时间。
+      带unetCache策略，执行完成后在`./results_unetCache`目录下生成推理图片，在当前目录生成一个`image_info.json`文件，记录着图片和prompt的对应关系，并在终端显示推理时间。
+      带unetCache策略，同时使用双卡并行策略，执行完成后在`./results_unetCache_parallel`目录下生成推理图片，在当前目录生成一个`image_info.json`文件，记录着图片和prompt的对应关系，并在终端显示推理时间。
 
 ## 精度验证<a name="section741711594518"></a>
 
@@ -289,7 +303,7 @@
       ```
 
       参数说明：
-      - --model：模型名称或本地模型目录的路径。
+      - --model：模型权重路径。
       - --output_dir：存放导出模型的目录。
       - --prompt_file：提示词文件。
       - --prompt_file_type: prompt文件类型，用于指定读取方式，可选plain，parti，hpsv2。
@@ -300,9 +314,9 @@
       - --steps：生成图片迭代次数。
       - --device：推理设备ID；可用逗号分割传入两个设备ID，此时会使用并行方式进行推理。
 
-      不带unetCache，执行完成后会在`./results_PartiPrompts`目录下生成推理图片，并且会在当前目录生成一个`image_info.json`文件，记录着图片和prompt的对应关系。
-      带unetCache，执行完成后会在`./results_PartiPrompts_unetCache`目录下生成推理图片，并且会在当前目录生成一个`image_info.json`文件，记录着图片和prompt的对应关系。
-      带unetCache，同时使用双卡并行策略，执行完成后会在`./results_PartiPrompts_unetCache_parallel`目录下生成推理图片，并且会在当前目录生成一个`image_info.json`文件，记录着图片和prompt的对应关系。
+      不带unetCache策略，执行完成后在`./results_PartiPrompts`目录下生成推理图片，在当前目录生成一个`image_info.json`文件，记录着图片和prompt的对应关系，并在终端显示推理时间。
+      带unetCache策略，执行完成后在`./results_PartiPrompts_unetCache`目录下生成推理图片，在当前目录生成一个`image_info.json`文件，记录着图片和prompt的对应关系，并在终端显示推理时间。
+      带unetCache策略，同时使用双卡并行策略，执行完成后在`./results_PartiPrompts_unetCache_parallel`目录下生成推理图片，在当前目录生成一个`image_info.json`文件，记录着图片和prompt的对应关系，并在终端显示推理时间。
 
    4. 计算CLIP-score
 
@@ -325,16 +339,16 @@
 
 ## 量化功能【可选】<a name="section741711594518"></a>
 
-若使用W8A8量化功能，分辨率只支持1024x1024和512x512：
+可使用W8A8量化功能提升性能，但可能导致精度下降。默认batch_size为1，默认分辨率为1024x1024；也支持batch_size为2、分辨率为512x512的场景（修改第4步和第5步的参数即可）。
 
-   1. 导出模型，height只支持1024和512，width只支持1024和512
+   1. 导出浮点pt模型并进行编译。
 
       ```bash
       # 使用unetCache, 非并行
-      python3 export_ts.py --model ${model_base} --output_dir ./models --use_cache --batch_size 1 --flag 0 --soc A2 --device 0 --height 1024 --width 1024
+      python3 export_ts.py --model ${model_base} --output_dir ./models --use_cache --flag 0 --soc A2 --device 0
       
       # 不使用unetCache, 非并行
-      python3 export_ts.py --model ${model_base} --output_dir ./models --batch_size 1 --flag 0 --soc A2 --device 0 --height 1024 --width 1024
+      python3 export_ts.py --model ${model_base} --output_dir ./models --flag 0 --soc A2 --device 0
       ```
 
    2. 量化编译。./quant/build.sh中的TorchPath需要指定为python安装torch的路径。
@@ -346,7 +360,7 @@
       bash build.sh
       ```
 
-   3. 导出unet pt模型的输入。
+   3. 导出浮点unet模型的输入。执行完毕后会在当前路径下生成unet_data.npy文件。
 
       执行命令：
 
@@ -361,8 +375,6 @@
               --output_dir ./models \
               --use_cache \
               --flag 0 \
-              --height 1024 \
-              --width 1024 \
               --save_unet_input
       # 若不使用UnetCache策略
       python3 stable_diffusionxl_pipeline.py \
@@ -373,12 +385,10 @@
               --steps 50 \
               --output_dir ./models \
               --flag 0 \
-              --height 1024 \
-              --width 1024 \
               --save_unet_input
       ```
 
-   4. 导出pt模型并进行编译。
+   4. 导出量化pt模型并进行编译。
 
       执行命令：
 
@@ -391,9 +401,11 @@
       ```
 
       参数说明：
-      - --batch_size：设置batch_size, 默认值为1, 可支持batch_size=2的场景
-      - --height：默认分辨率为1024x1024，可支持512x512的场景（性能受影响）
-      - --width：默认分辨率为1024x1024，可支持512x512的场景（性能受影响）
+      - --model：模型权重路径
+      - --output_dir：存放导出模型的目录，执行完成后在`./models_quant`目录下生成量化模型。
+      - --batch_size：默认batch_size为1（可支持batch_size=2的场景, 性能受影响）
+      - --height：默认分辨率为1024x1024（可支持512x512的场景, 性能受影响）
+      - --width：默认分辨率为1024x1024（可支持512x512的场景, 性能受影响）
 
    5. 开始推理验证。
 
@@ -401,7 +413,7 @@
 
       ```bash
       # 使用UnetCache策略，且非并行
-      python3 stable_diffusionxl_pipeline.py \
+      numactl -C 0-23 python3 stable_diffusionxl_pipeline.py \
               --model ${model_base} \
               --prompt_file ./prompts.txt \
               --device 0 \
@@ -414,8 +426,9 @@
               --height 1024 \
               --width 1024 \
               --quant
+
       # 不使用UnetCache策略，且非并行
-      python3 stable_diffusionxl_pipeline.py \
+      numactl -C 0-23 python3 stable_diffusionxl_pipeline.py \
               --model ${model_base} \
               --prompt_file ./prompts.txt \
               --device 0 \
@@ -428,6 +441,8 @@
               --width 1024 \
               --quant
       ```
+      
+      执行完成后在`./results_quant`目录下生成推理图片，在当前目录生成一个`image_info.json`文件，记录着图片和prompt的对应关系，并在终端显示推理时间。
 
 
 # 模型推理性能&精度<a name="ZH-CN_TOPIC_0000001172201573"></a>
@@ -436,27 +451,9 @@
 
 ### StableDiffusionxl
 
-| 硬件形态 | 迭代次数 | 平均耗时    | cpu规格 |
-| :------: | :--: | :--------: | :--------: |
-| A2  |    50  |  6.542s   | 64核(arm) |
-
-性能测试需要独占npu和cpu
+| 硬件形态  | cpu规格 | batch size | 迭代次数 | 优化手段 | 平均耗时 | 精度  | 采样器 |
+| :------: | :------: | :------: | :------: | :------: | :------: | :------: | :------: |
+| A2  | 64核(arm) |  1  |  50  | with UnetCache, w/o 量化 |  4s   | clip score 0.376 | ddim |
+| A2  | 64核(arm) |  1  |  50  | with UnetCache, with 量化 |  3.6s   | clip score 0.371 | ddim |
 
-迭代50次的参考精度结果如下：
-
-   ```
-   average score: 0.378
-   category average scores:
-   [Abstract], average score: 0.265
-   [Vehicles], average score: 0.380
-   [Illustrations], average score: 0.372
-   [Arts], average score: 0.414
-   [World Knowledge], average score: 0.391
-   [People], average score: 0.379
-   [Animals], average score: 0.390
-   [Artifacts], average score: 0.373
-   [Food & Beverage], average score: 0.372
-   [Produce & Plants], average score: 0.370
-   [Outdoor Scenes], average score: 0.373
-   [Indoor Scenes], average score: 0.389
-   ```
\ No newline at end of file
+性能测试需要独占npu和cpu
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/attention_processor.patch b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/attention_processor.patch
index 26f296526adcfaf629f3c47a311b88bb4aa002a2..bd15281c5a3acf9752eec8a239323f66f1beadb7 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/attention_processor.patch
+++ b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/attention_processor.patch
@@ -1,5 +1,5 @@
---- attention_processor.py	2024-02-22 19:06:56.596000000 +0800
-+++ attention_processor.py	2024-02-22 19:07:17.232000000 +0800
+--- attention_processor.py	2024-07-02 07:42:32.312000000 +0000
++++ attention_processor.py	2024-07-02 07:44:55.100000000 +0000
 @@ -205,10 +205,11 @@
          # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
          # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
@@ -14,5 +14,5 @@
 +        #     )
 +        processor = AttnProcessor()
          self.set_processor(processor)
-
-     def set_use_memory_efficient_attention_xformers(
\ No newline at end of file
+ 
+     def set_use_memory_efficient_attention_xformers(
diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/export_ts_quant.py b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/export_ts_quant.py
index 0f91415f6dc0eca464444d54ec0ddc22cfa3e2d0..c600937c7fd08f63bdea47bef1f854eb039af841 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/export_ts_quant.py
+++ b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/export_ts_quant.py
@@ -34,7 +34,7 @@ def parse_arguments() -> Namespace:
         "-o",
         "--output_dir",
         type=str,
-        default="./models",
+        default="./models_quant",
         help="Path of directory to save pt models.",
     )
     parser.add_argument(
@@ -268,16 +268,17 @@ def export_ddim_parallel(sd_pipeline, args):
             mindietorch.Input((1,), dtype=mindietorch.dtype.INT64)]
         compile_ddim(model, inputs, scheduler_compiled_path, soc_version)
 
-def trace_quant_model(model, calib_datas, input_shape, pt_path):
-    save_path = pt_path[:-3]
+def trace_quant_model(model, calib_datas, input_shape, pt_path, need_calib=True):
+    save_path = os.path.dirname(os.path.split(pt_path)[0])
     quant_model = copy.deepcopy(model)
     export_model = copy.deepcopy(model)
-    quant_config = QuantConfig(disable_names=[],
-                               amp_num=0, input_shape=input_shape,
-                               act_method=0, quant_mode=0, a_signed=True)
-    calibrator = Calibrator(quant_model, quant_config, calib_data=calib_datas)
-    calibrator.run()
-    calibrator.export_param(os.path.join(save_path, 'quant_weights'))
+    if need_calib:
+        quant_config = QuantConfig(disable_names=[],
+                                   amp_num=0, input_shape=input_shape,
+                                   act_method=0, quant_mode=0, a_signed=True, sigma=40)
+        calibrator = Calibrator(quant_model, quant_config, calib_data=calib_datas)
+        calibrator.run()
+        calibrator.export_param(os.path.join(save_path, 'quant_weights'))
     input_scale = np.load(os.path.join(save_path, 'quant_weights', 'input_scale.npy'), allow_pickle=True).item()
     input_offset = np.load(os.path.join(save_path, 'quant_weights', 'input_offset.npy'), allow_pickle=True).item()
     weight_scale = np.load(os.path.join(save_path, 'quant_weights', 'weight_scale.npy'), allow_pickle=True).item()
@@ -315,7 +316,7 @@ def export_unet_cache(sd_pipeline, args, input_data):
         calib_datas = [list(input_data['cache'])]
         unet = UnetExport(unet_model)
         unet.eval()
-        trace_quant_model(unet, calib_datas, [batch_size, in_channels, sample_size, sample_size], unet_pt_path)
+        trace_quant_model(unet, calib_datas, [batch_size, in_channels, sample_size, sample_size], unet_pt_path, need_calib=True)
     # compile
     batch_size = args.batch_size * 2
     unet_compiled_path = os.path.join(unet_path, f"unet_bs{batch_size}_{parallel}compile_0_quant_{args.height}x{args.width}.ts")
@@ -361,7 +362,7 @@ def export_unet_skip(sd_pipeline, args, input_data):
         calib_datas = [list(input_data['skip'])]
         unet = UnetExport(unet_model)
         unet.eval()
-        trace_quant_model(unet, calib_datas, [batch_size, in_channels, sample_size, sample_size], unet_pt_path)
+        trace_quant_model(unet, calib_datas, [batch_size, in_channels, sample_size, sample_size], unet_pt_path, need_calib=False)
     # compile
     batch_size = args.batch_size * 2
     unet_compiled_path = os.path.join(unet_path, f"unet_bs{batch_size}_{parallel}compile_1_quant_{args.height}x{args.width}.ts")
@@ -405,7 +406,7 @@ def export_unet_init(sd_pipeline, args, input_data):
         calib_datas = [list(input_data['no_cache'])]
         unet = UnetExportInit(unet_model)
         unet.eval()
-        trace_quant_model(unet, calib_datas, [batch_size, in_channels, sample_size, sample_size], unet_pt_path)
+        trace_quant_model(unet, calib_datas, [batch_size, in_channels, sample_size, sample_size], unet_pt_path, need_calib=True)
     # compile
     batch_size = args.batch_size * 2
     unet_compiled_path = os.path.join(unet_path, f"unet_bs{batch_size}_compile_quant_{args.height}x{args.width}.ts")
diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/stable_diffusionxl_pipeline.py b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/stable_diffusionxl_pipeline.py
index a35e44b5ac166b6729c3ad62d0b3735da28452e8..6d0b4d61c084c33455f9d0c331e1cf9f9bbb4be8 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/stable_diffusionxl_pipeline.py
+++ b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/stable_diffusionxl_pipeline.py
@@ -962,7 +962,7 @@ def main():
     if args.use_cache:
         flag_cache = 1
         for i in args.cache_steps.split(','):
-            if int(i) >= args.steps:
+            if not i.isdigit() or int(i) >= args.steps:
                 continue
             skip_steps[int(i)] = 1
 
@@ -1001,7 +1001,8 @@ def main():
                 flag_ddim=flag_ddim,
                 flag_cache=flag_cache,
             )
-        use_time += time.time() - start_time
+        if i > 4: # do not count the time spent inferring the first 0 to 4 images
+            use_time += time.time() - start_time
 
         for j in range(n_prompts):
             image_save_path = os.path.join(save_dir, f"{save_names[j]}.png")
@@ -1014,6 +1015,7 @@ def main():
 
             image_info[-1]['images'].append(image_save_path)
 
+    infer_num = infer_num - 5 # do not count the time spent inferring the first 5 images
     print(f"[info] infer number: {infer_num}; use time: {use_time:.3f}s\n"
           f"average time: {use_time / infer_num:.3f}s\n"
           f"clip time: {clip_time / infer_num:.3f}s\n"
diff --git a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/stable_diffusionxl_pipeline_cache_parallel.py b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/stable_diffusionxl_pipeline_cache_parallel.py
index b51ac4bfebd25423ae154e4ac0ee51575eb74255..cd024fb2503c8d51cb05ca31e97a9b8bc6261671 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/stable_diffusionxl_pipeline_cache_parallel.py
+++ b/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/stable_diffusionxl_pipeline_cache_parallel.py
@@ -1004,7 +1004,8 @@ def main():
             flag_ddim=flag_ddim,
             flag_cache=flag_cache,
         )
-        use_time += time.time() - start_time
+        if i > 4: # do not count the time spent inferring the first 0 to 4 images
+            use_time += time.time() - start_time
 
         for j in range(n_prompts):
             image_save_path = os.path.join(save_dir, f"{save_names[j]}.png")
@@ -1017,6 +1018,7 @@ def main():
 
             image_info[-1]['images'].append(image_save_path)
 
+    infer_num = infer_num - 5 # do not count the time spent inferring the first 5 images
     print(f"[info] infer number: {infer_num}; use time: {use_time:.3f}s\n"
           f"average time: {use_time / infer_num:.3f}s\n"
           f"clip time: {clip_time / infer_num:.3f}s\n"