From 61f144abd048d1a49201ee7a97d6031b21a95cb5 Mon Sep 17 00:00:00 2001
From: lijian
Date: Thu, 25 Sep 2025 12:09:38 +0800
Subject: [PATCH] show full result on input audio; add controls on librispeech performance and wer demo

---
 ACL_PyTorch/built-in/audio/whisperx/README.md | 11 ++-
 ACL_PyTorch/built-in/audio/whisperx/infer.py  | 77 ++++++++++++-------
 .../built-in/audio/whisperx/run_wer_test.py   |  2 +-
 3 files changed, 57 insertions(+), 33 deletions(-)

diff --git a/ACL_PyTorch/built-in/audio/whisperx/README.md b/ACL_PyTorch/built-in/audio/whisperx/README.md
index 4c152b7cee..798f0f81ad 100644
--- a/ACL_PyTorch/built-in/audio/whisperx/README.md
+++ b/ACL_PyTorch/built-in/audio/whisperx/README.md
@@ -70,12 +70,13 @@ cd ..
 The file and directory layout is roughly as follows:
 
 ```text
-📁 whisper/
+📁 whisperx/
 ├── check_numa.sh
 ├── audio.mp3
 ├── infer.py
 ├── modeling_whisper.py
 ├── pipeline.py
+├── 📁 LibriSpeech/
 ├── 📁 patches/
 |   └── 📄 patch_apply.py
 |   └── 📄 kaldi.patch
@@ -134,11 +135,15 @@ infer.py inference parameters:
 * --whisper_model_path: path to the whisper model weights, defaults to "./weight/Whisper-large-v3/large-v3.pt"
 * --vad_model_path: path to the vad model weights, defaults to "./weight/speech_fsmn_vad_zh-cn-16k-common-pytorch"
 * --audio_path: path to the audio file, defaults to "audio.mp3"
+* --librispeech_perf_test: when enabled, runs a performance test on part of the LibriSpeech dataset and prints the results and the transcription ratio. Defaults to True
+* --skip_librispeech_perf_test: when passed, skips the LibriSpeech performance test
 * --speech_path: path to the librispeech dev clean dataset files, defaults to "./LibriSpeech/dev-clean/"
-* --num_audio_files: number of audio files selected from the librispeech dev clean dataset for the performance test, defaults to 52; tune the count so that the number of segments after vad splitting and merging is close to, but not larger than, the batch size for best performance.
+* --num_audio_files: number of audio files selected from the librispeech dev clean dataset for the performance test, defaults to 52; tune the count so that the number of segments after vad splitting and merging is close to, but not larger than, the batch size for best performance
+* --librispeech_wer_demo: when enabled, transcribes one LibriSpeech audio sample and computes its wer. Defaults to True
+* --skip_librispeech_wer_demo: when passed, skips the wer accuracy demo
 * --device: npu device id, defaults to 0
 * --batch_size: batch size, defaults to 16
-* --warm_up: number of warm-up runs, defaults to 5
+* --warmup: number of warm-up runs, defaults to 4; the first warm-up run compiles the model into a graph
 
 ## Performance data
 infer.py transcribes a subset of the librispeech dev clean dataset; performance is as follows
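The two flag pairs documented above follow a common argparse idiom: an enable flag and a skip flag that write to the same destination, with set_defaults() making the feature opt-out. The infer.py hunk below implements exactly this; the following is a minimal, runnable sketch of the pattern (only the flag names are taken from the patch, the bare parser is illustrative):

```python
# A minimal sketch of the enable/skip flag pair added by this patch.
# Only the flag names mirror infer.py; the surrounding scaffolding is illustrative.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--librispeech_perf_test", dest="librispeech_perf_test",
                    action="store_true",
                    help="conduct performance test on LibriSpeech dataset")
parser.add_argument("--skip_librispeech_perf_test", dest="librispeech_perf_test",
                    action="store_false",
                    help="skip performance test on LibriSpeech dataset")
parser.set_defaults(librispeech_perf_test=True)  # feature is opt-out

print(parser.parse_args([]).librispeech_perf_test)                                # True
print(parser.parse_args(["--skip_librispeech_perf_test"]).librispeech_perf_test)  # False
```

Passing neither flag keeps the test enabled; passing the skip flag turns it off, and if both appear on the command line the last one wins.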
diff --git a/ACL_PyTorch/built-in/audio/whisperx/infer.py b/ACL_PyTorch/built-in/audio/whisperx/infer.py
index c80a6b3bc3..0069a4d02d 100644
--- a/ACL_PyTorch/built-in/audio/whisperx/infer.py
+++ b/ACL_PyTorch/built-in/audio/whisperx/infer.py
@@ -36,9 +36,20 @@ def parse_args():
                         help="vad model checkpoint file path")
     parser.add_argument("--audio_path", type=str, default="./audio.mp3",
                         help="audio file path")
+    parser.add_argument("--librispeech_perf_test", dest="librispeech_perf_test", action="store_true",
+                        help="conduct performance test on LibriSpeech dataset")
+    parser.add_argument("--skip_librispeech_perf_test", dest="librispeech_perf_test", action="store_false",
+                        help="skip performance test on LibriSpeech dataset")
+    parser.set_defaults(librispeech_perf_test=True)
     parser.add_argument("--speech_path", type=str, default="./LibriSpeech/dev-clean/",
                         help="librispeech dev clean english transaction speech data path")
-    parser.add_argument("--num_audio_files", type=int, default=52, help="num of audio files selected for performance test")
+    parser.add_argument("--num_audio_files", type=int, default=52,
+                        help="num of audio files selected from LibriSpeech for performance test")
+    parser.add_argument("--librispeech_wer_demo", dest="librispeech_wer_demo", action="store_true",
+                        help="perform librispeech wer demo")
+    parser.add_argument("--skip_librispeech_wer_demo", dest="librispeech_wer_demo", action="store_false",
+                        help="skip librispeech wer demo")
+    parser.set_defaults(librispeech_wer_demo=True)
     parser.add_argument('--device', type=int, default='0', help="npu device id")
     parser.add_argument('--batch_size', type=int, default=16, help="batch size")
     parser.add_argument('--warmup', type=int, default=4, help="Warm up times")
@@ -85,44 +96,52 @@ if __name__ == '__main__':
         whisper_decode_options=whisper_decode_options
     )
 
-    y, audio_sr = librosa.load(args.audio_path)
-    duration_seconds = librosa.get_duration(y=y, sr=audio_sr)
     audio_sample = load_audio(args.audio_path)
 
-    data_path = f'{args.speech_path}/1919/142785'
-    audio_files = collect_audio_files([data_path])[:args.num_audio_files]
-
-    def get_audio(audio_file):
-        return load_audio(audio_file)
-
-    speech_data_list = list(map(get_audio, audio_files))
-    speech_data = np.concatenate(speech_data_list)
-
-    duration_seconds = 0
-    for audio in audio_files:
-        y, audio_sr = librosa.load(audio)
-        duration_seconds += librosa.get_duration(y=y, sr=audio_sr)
-
     with torch.inference_mode():
         for _step in range(args.warmup):
             result = torchair_pipe.transcribe(audio_sample, batch_size=args.batch_size)
             print(f"warm up {_step}/{args.warmup} {result[0]['text']}")
         print(f"warm up success.")
 
+        # run inference on the input audio
+        y, audio_sr = librosa.load(args.audio_path)
+        duration_seconds = librosa.get_duration(y=y, sr=audio_sr)
+        print(f"perform inference on input audio: {args.audio_path}")
         t0 = time.time()
-        result = torchair_pipe.transcribe(speech_data, batch_size=args.batch_size)
+        result = torchair_pipe.transcribe(audio_sample, batch_size=args.batch_size)
         t1 = time.time()
-        print(f"transcription {result}")
+        print(f"transcription result: {result}")
         print(f"transcription ratio: {duration_seconds / (t1 - t0)}, speech durarations {duration_seconds}")
 
+        if args.librispeech_perf_test:
+            # run inference on audio from the LibriSpeech dataset
+            data_path = f'{args.speech_path}/1919/142785'
+            audio_files = collect_audio_files([data_path])[:args.num_audio_files]
+            speech_data_list = list(map(load_audio, audio_files))
+            speech_data = np.concatenate(speech_data_list)
+            duration_seconds = 0
+            for audio in audio_files:
+                y, audio_sr = librosa.load(audio)
+                duration_seconds += librosa.get_duration(y=y, sr=audio_sr)
+
+            print(f"-----performance test on samples from LibriSpeech dataset-----")
+            t0 = time.time()
+            result = torchair_pipe.transcribe(speech_data, batch_size=args.batch_size)
+            t1 = time.time()
+            print(f"LibriSpeech transcription result: {result}")
+            print(f"transcription ratio: {duration_seconds / (t1 - t0)}, speech durations {duration_seconds}")
+
         # wer test
-        sample = load_audio(f'{args.speech_path}/1919/142785/1919-142785-0007.flac')
-        result = torchair_pipe.transcribe(sample, batch_size=args.batch_size)
-
-        reference = "MODE CHOOSE THE GREENEST CUCUMBERS AND THOSE THAT ARE MOST FREE FROM SEEDS \
-        PUT THEM IN STRONG SALT AND WATER WITH A CABBAGE LEAF TO KEEP THEM DOWN TIE A PAPER OVER \
-        THEM AND PUT THEM IN A WARM PLACE TILL THEY ARE YELLOW THEN WASH THEM AND SET THEM OVER THE \
-        FIRE IN FRESH WATER WITH A VERY LITTLE SALT AND ANOTHER CABBAGE LEAF OVER THEM COVER VERY CLOSELY BUT TAKE CARE THEY DO NOT BOIL"
-
-        error_rate = check_wer(reference, result[0]['text'])
-        print(f"wer: {error_rate:.4f}")
+        if args.librispeech_wer_demo:
+            sample = load_audio(f'{args.speech_path}/1919/142785/1919-142785-0007.flac')
+            result = torchair_pipe.transcribe(sample, batch_size=args.batch_size)
+
+            reference = "MODE CHOOSE THE GREENEST CUCUMBERS AND THOSE THAT ARE MOST FREE FROM SEEDS \
+            PUT THEM IN STRONG SALT AND WATER WITH A CABBAGE LEAF TO KEEP THEM DOWN TIE A PAPER OVER \
+            THEM AND PUT THEM IN A WARM PLACE TILL THEY ARE YELLOW THEN WASH THEM AND SET THEM OVER THE \
+            FIRE IN FRESH WATER WITH A VERY LITTLE SALT AND ANOTHER CABBAGE LEAF OVER THEM COVER VERY CLOSELY BUT TAKE CARE THEY DO NOT BOIL"
+
+            print(f"perform wer demo on a single audio sample from LibriSpeech dataset: {args.speech_path}/1919/142785/1919-142785-0007.flac")
+            error_rate = check_wer(reference, result[0]['text'])
+            print(f"wer: {error_rate:.4f}")
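Both timed sections above report a "transcription ratio": seconds of audio processed per second of wall-clock time, i.e. an inverse real-time factor, where values above 1.0 mean faster than real time. A small self-contained sketch of the computation follows; the helper name, the sleep stand-in, and the 60-second duration are illustrative, not values from the patch:

```python
# Hedged sketch of the "transcription ratio" printed by infer.py: seconds of
# audio transcribed per second of wall-clock time (an inverse real-time factor).
import time

def transcription_ratio(duration_seconds: float, t0: float, t1: float) -> float:
    """Audio seconds processed per wall-clock second; >1.0 is faster than real time."""
    return duration_seconds / (t1 - t0)

t0 = time.time()
time.sleep(0.1)  # stand-in for torchair_pipe.transcribe(...)
t1 = time.time()
print(transcription_ratio(60.0, t0, t1))  # roughly 600 for 60 s of audio in ~0.1 s
```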
diff --git a/ACL_PyTorch/built-in/audio/whisperx/run_wer_test.py b/ACL_PyTorch/built-in/audio/whisperx/run_wer_test.py
index 6a0c7dd1bc..3fe56b2510 100644
--- a/ACL_PyTorch/built-in/audio/whisperx/run_wer_test.py
+++ b/ACL_PyTorch/built-in/audio/whisperx/run_wer_test.py
@@ -95,7 +95,7 @@ if __name__ == '__main__':
     args = parse_args()
 
     device = torch.device('npu:{}'.format(args.device))
-    audio_txt_pairs = get_audio_txt_pairs(args.speech_data)
+    audio_txt_pairs = get_audio_txt_pairs(args.speech_path)
 
     whisper_decode_options = whisper.DecodingOptions(language='en', without_timestamps=True, fp16=True)
     torchair_pipe = TorchairPipeline(
-- 
Gitee
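The infer.py demo and run_wer_test.py both funnel results through the repo's check_wer(reference, hypothesis) helper, whose implementation is not part of this patch. As a reading aid, here is an assumed, self-contained illustration of the standard word error rate definition (word-level edit distance divided by the reference word count); it is not the project's implementation and may differ from it:

```python
# Assumed illustration of word error rate: word-level Levenshtein distance
# divided by the number of reference words. The repo's check_wer() is not
# shown in this patch and may be implemented differently.
def wer(reference: str, hypothesis: str) -> float:
    ref, hyp = reference.split(), hypothesis.split()
    # Dynamic-programming edit distance over words.
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i  # delete all remaining reference words
    for j in range(len(hyp) + 1):
        d[0][j] = j  # insert all remaining hypothesis words
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[len(ref)][len(hyp)] / max(len(ref), 1)

# One substitution (DO -> DONT) plus one deletion (NOT) over 6 reference words.
print(f"wer: {wer('TAKE CARE THEY DO NOT BOIL', 'TAKE CARE THEY DONT BOIL'):.4f}")  # 0.3333
```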