From 61f144abd048d1a49201ee7a97d6031b21a95cb5 Mon Sep 17 00:00:00 2001
From: lijian
Date: Thu, 25 Sep 2025 12:09:38 +0800
Subject: [PATCH] show full result on input audio; add controls on librispeech performance and wer demo

---
 ACL_PyTorch/built-in/audio/whisperx/README.md | 11 ++-
 ACL_PyTorch/built-in/audio/whisperx/infer.py  | 77 ++++++++++++-------
 .../built-in/audio/whisperx/run_wer_test.py   |  2 +-
 3 files changed, 57 insertions(+), 33 deletions(-)

diff --git a/ACL_PyTorch/built-in/audio/whisperx/README.md b/ACL_PyTorch/built-in/audio/whisperx/README.md
index 4c152b7cee..798f0f81ad 100644
--- a/ACL_PyTorch/built-in/audio/whisperx/README.md
+++ b/ACL_PyTorch/built-in/audio/whisperx/README.md
@@ -70,12 +70,13 @@ cd ..
 The file and directory layout is roughly as follows:
 
 ```text
-📁 whisper/
+📁 whisperx/
 ├── check_numa.sh
 ├── audio.mp3
 ├── infer.py
 ├── modeling_whisper.py
 ├── pipeline.py
+├── 📁 LibriSpeech/
 ├── 📁 patches/
 |   └── 📄 patch_apply.py
 |   └── 📄 kaldi.patch
@@ -134,11 +135,15 @@ infer.py inference parameters:
 * --whisper_model_path: path to the whisper model weights, defaults to "./weight/Whisper-large-v3/large-v3.pt"
 * --vad_model_path: path to the vad model weights, defaults to "./weight/speech_fsmn_vad_zh-cn-16k-common-pytorch"
 * --audio_path: path to the audio file, defaults to "audio.mp3"
+* --librispeech_perf_test: when enabled, runs a performance test on part of the LibriSpeech dataset and prints the results and the transcription ratio. Defaults to True
+* --skip_librispeech_perf_test: when passed, skips the LibriSpeech performance test
 * --speech_path: path to the librispeech dev clean dataset files, defaults to "./LibriSpeech/dev-clean/"
-* --num_audio_files: number of audio files selected from the librispeech dev clean dataset for the performance test, defaults to 52; tune the count so that the number of segments after vad splitting and merging is close to, but not larger than, the batch size for best performance.
+* --num_audio_files: number of audio files selected from the librispeech dev clean dataset for the performance test, defaults to 52; tune the count so that the number of segments after vad splitting and merging is close to, but not larger than, the batch size for best performance
+* --librispeech_wer_demo: when enabled, transcribes one LibriSpeech audio sample and computes its wer. Defaults to True
+* --skip_librispeech_wer_demo: when passed, skips the wer accuracy demo
 * --device: npu device id, defaults to 0
 * --batch_size: batch size, defaults to 16
-* --warm_up: number of warm-up runs, defaults to 5
+* --warmup: number of warm-up runs, defaults to 4; the first warm-up run compiles the model into a graph
 
 ## Performance data
 infer.py transcribes a subset of the librispeech dev clean dataset; performance is as follows
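The two flag pairs documented above follow a common argparse idiom: an enable flag and a skip flag that write to the same destination, with set_defaults() making the feature opt-out. The infer.py hunk below implements exactly this; the following is a minimal, runnable sketch of the pattern (only the flag names are taken from the patch, the bare parser is illustrative):

```python
# A minimal sketch of the enable/skip flag pair added by this patch.
# Only the flag names mirror infer.py; the surrounding scaffolding is illustrative.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--librispeech_perf_test", dest="librispeech_perf_test",
                    action="store_true",
                    help="conduct performance test on LibriSpeech dataset")
parser.add_argument("--skip_librispeech_perf_test", dest="librispeech_perf_test",
                    action="store_false",
                    help="skip performance test on LibriSpeech dataset")
parser.set_defaults(librispeech_perf_test=True)  # feature is opt-out

print(parser.parse_args([]).librispeech_perf_test)                                # True
print(parser.parse_args(["--skip_librispeech_perf_test"]).librispeech_perf_test)  # False
```

Passing neither flag keeps the test enabled; passing the skip flag turns it off, and if both appear on the command line the last one wins.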
diff --git a/ACL_PyTorch/built-in/audio/whisperx/infer.py b/ACL_PyTorch/built-in/audio/whisperx/infer.py
index c80a6b3bc3..0069a4d02d 100644
--- a/ACL_PyTorch/built-in/audio/whisperx/infer.py
+++ b/ACL_PyTorch/built-in/audio/whisperx/infer.py
@@ -36,9 +36,20 @@ def parse_args():
                         help="vad model checkpoint file path")
     parser.add_argument("--audio_path", type=str, default="./audio.mp3",
                         help="audio file path")
+    parser.add_argument("--librispeech_perf_test", dest="librispeech_perf_test", action="store_true",
+                        help="conduct performance test on LibriSpeech dataset")
+    parser.add_argument("--skip_librispeech_perf_test", dest="librispeech_perf_test", action="store_false",
+                        help="skip performance test on LibriSpeech dataset")
+    parser.set_defaults(librispeech_perf_test=True)
     parser.add_argument("--speech_path", type=str, default="./LibriSpeech/dev-clean/",
                         help="librispeech dev clean english transaction speech data path")
-    parser.add_argument("--num_audio_files", type=int, default=52, help="num of audio files selected for performance test")
+    parser.add_argument("--num_audio_files", type=int, default=52,
+                        help="num of audio files selected from LibriSpeech for performance test")
+    parser.add_argument("--librispeech_wer_demo", dest="librispeech_wer_demo", action="store_true",
+                        help="perform librispeech wer demo")
+    parser.add_argument("--skip_librispeech_wer_demo", dest="librispeech_wer_demo", action="store_false",
+                        help="skip librispeech wer demo")
+    parser.set_defaults(librispeech_wer_demo=True)
     parser.add_argument('--device', type=int, default='0', help="npu device id")
     parser.add_argument('--batch_size', type=int, default=16, help="batch size")
     parser.add_argument('--warmup', type=int, default=4, help="Warm up times")
@@ -85,44 +96,52 @@ if __name__ == '__main__':
         whisper_decode_options=whisper_decode_options
     )
 
-    y, audio_sr = librosa.load(args.audio_path)
-    duration_seconds = librosa.get_duration(y=y, sr=audio_sr)
     audio_sample = load_audio(args.audio_path)
 
-    data_path = f'{args.speech_path}/1919/142785'
-    audio_files = collect_audio_files([data_path])[:args.num_audio_files]
-
-    def get_audio(audio_file):
-        return load_audio(audio_file)
-
-    speech_data_list = list(map(get_audio, audio_files))
-    speech_data = np.concatenate(speech_data_list)
-
-    duration_seconds = 0
-    for audio in audio_files:
-        y, audio_sr = librosa.load(audio)
-        duration_seconds += librosa.get_duration(y=y, sr=audio_sr)
-
     with torch.inference_mode():
         for _step in range(args.warmup):
             result = torchair_pipe.transcribe(audio_sample, batch_size=args.batch_size)
             print(f"warm up {_step}/{args.warmup} {result[0]['text']}")
         print(f"warm up success.")
 
+        # run inference on the input audio
+        y, audio_sr = librosa.load(args.audio_path)
+        duration_seconds = librosa.get_duration(y=y, sr=audio_sr)
+        print(f"perform inference on input audio: {args.audio_path}")
         t0 = time.time()
-        result = torchair_pipe.transcribe(speech_data, batch_size=args.batch_size)
+        result = torchair_pipe.transcribe(audio_sample, batch_size=args.batch_size)
         t1 = time.time()
-        print(f"transcription {result}")
+        print(f"transcription result: {result}")
         print(f"transcription ratio: {duration_seconds / (t1 - t0)}, speech durarations {duration_seconds}")
 
+        if args.librispeech_perf_test:
+            # run inference on audio from the LibriSpeech dataset
+            data_path = f'{args.speech_path}/1919/142785'
+            audio_files = collect_audio_files([data_path])[:args.num_audio_files]
+            speech_data_list = list(map(load_audio, audio_files))
+            speech_data = np.concatenate(speech_data_list)
+            duration_seconds = 0
+            for audio in audio_files:
+                y, audio_sr = librosa.load(audio)
+                duration_seconds += librosa.get_duration(y=y, sr=audio_sr)
+
+            print(f"-----performance test on samples from LibriSpeech dataset-----")
+            t0 = time.time()
+            result = torchair_pipe.transcribe(speech_data, batch_size=args.batch_size)
+            t1 = time.time()
+            print(f"LibriSpeech transcription result: {result}")
+            print(f"transcription ratio: {duration_seconds / (t1 - t0)}, speech durations {duration_seconds}")
+
         # wer test
-        sample = load_audio(f'{args.speech_path}/1919/142785/1919-142785-0007.flac')
-        result = torchair_pipe.transcribe(sample, batch_size=args.batch_size)
-
-        reference = "MODE CHOOSE THE GREENEST CUCUMBERS AND THOSE THAT ARE MOST FREE FROM SEEDS \
-        PUT THEM IN STRONG SALT AND WATER WITH A CABBAGE LEAF TO KEEP THEM DOWN TIE A PAPER OVER \
-        THEM AND PUT THEM IN A WARM PLACE TILL THEY ARE YELLOW THEN WASH THEM AND SET THEM OVER THE \
-        FIRE IN FRESH WATER WITH A VERY LITTLE SALT AND ANOTHER CABBAGE LEAF OVER THEM COVER VERY CLOSELY BUT TAKE CARE THEY DO NOT BOIL"
-
-        error_rate = check_wer(reference, result[0]['text'])
-        print(f"wer: {error_rate:.4f}")
+        if args.librispeech_wer_demo:
+            sample = load_audio(f'{args.speech_path}/1919/142785/1919-142785-0007.flac')
+            result = torchair_pipe.transcribe(sample, batch_size=args.batch_size)
+
+            reference = "MODE CHOOSE THE GREENEST CUCUMBERS AND THOSE THAT ARE MOST FREE FROM SEEDS \
+            PUT THEM IN STRONG SALT AND WATER WITH A CABBAGE LEAF TO KEEP THEM DOWN TIE A PAPER OVER \
+            THEM AND PUT THEM IN A WARM PLACE TILL THEY ARE YELLOW THEN WASH THEM AND SET THEM OVER THE \
+            FIRE IN FRESH WATER WITH A VERY LITTLE SALT AND ANOTHER CABBAGE LEAF OVER THEM COVER VERY CLOSELY BUT TAKE CARE THEY DO NOT BOIL"
+
+            print(f"perform wer demo on a single audio sample from LibriSpeech dataset: {args.speech_path}/1919/142785/1919-142785-0007.flac")
+            error_rate = check_wer(reference, result[0]['text'])
+            print(f"wer: {error_rate:.4f}")
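Both timed sections above report a "transcription ratio": seconds of audio processed per second of wall-clock time, i.e. an inverse real-time factor, where values above 1.0 mean faster than real time. A small self-contained sketch of the computation follows; the helper name, the sleep stand-in, and the 60-second duration are illustrative, not values from the patch:

```python
# Hedged sketch of the "transcription ratio" printed by infer.py: seconds of
# audio transcribed per second of wall-clock time (an inverse real-time factor).
import time

def transcription_ratio(duration_seconds: float, t0: float, t1: float) -> float:
    """Audio seconds processed per wall-clock second; >1.0 is faster than real time."""
    return duration_seconds / (t1 - t0)

t0 = time.time()
time.sleep(0.1)  # stand-in for torchair_pipe.transcribe(...)
t1 = time.time()
print(transcription_ratio(60.0, t0, t1))  # roughly 600 for 60 s of audio in ~0.1 s
```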
diff --git a/ACL_PyTorch/built-in/audio/whisperx/run_wer_test.py b/ACL_PyTorch/built-in/audio/whisperx/run_wer_test.py
index 6a0c7dd1bc..3fe56b2510 100644
--- a/ACL_PyTorch/built-in/audio/whisperx/run_wer_test.py
+++ b/ACL_PyTorch/built-in/audio/whisperx/run_wer_test.py
@@ -95,7 +95,7 @@ if __name__ == '__main__':
     args = parse_args()
 
     device = torch.device('npu:{}'.format(args.device))
-    audio_txt_pairs = get_audio_txt_pairs(args.speech_data)
+    audio_txt_pairs = get_audio_txt_pairs(args.speech_path)
 
     whisper_decode_options = whisper.DecodingOptions(language='en', without_timestamps=True, fp16=True)
     torchair_pipe = TorchairPipeline(
-- 
Gitee
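The infer.py demo and run_wer_test.py both funnel results through the repo's check_wer(reference, hypothesis) helper, whose implementation is not part of this patch. As a reading aid, here is an assumed, self-contained illustration of the standard word error rate definition (word-level edit distance divided by the reference word count); it is not the project's implementation and may differ from it:

```python
# Assumed illustration of word error rate: word-level Levenshtein distance
# divided by the number of reference words. The repo's check_wer() is not
# shown in this patch and may be implemented differently.
def wer(reference: str, hypothesis: str) -> float:
    ref, hyp = reference.split(), hypothesis.split()
    # Dynamic-programming edit distance over words.
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i  # delete all remaining reference words
    for j in range(len(hyp) + 1):
        d[0][j] = j  # insert all remaining hypothesis words
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[len(ref)][len(hyp)] / max(len(ref), 1)

# One substitution (DO -> DONT) plus one deletion (NOT) over 6 reference words.
print(f"wer: {wer('TAKE CARE THEY DO NOT BOIL', 'TAKE CARE THEY DONT BOIL'):.4f}")  # 0.3333
```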