diff --git a/speech/speech_recognition/efficient_conformer_v2_wenet/pytorch/README.md b/speech/speech_recognition/efficient_conformer_v2_wenet/pytorch/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f6b7ff96bcee8f36bf3162ec21c441693fc50a7f
--- /dev/null
+++ b/speech/speech_recognition/efficient_conformer_v2_wenet/pytorch/README.md
@@ -0,0 +1,57 @@
+# Efficient Conformer V2
+
+## Model description
+Efficient Conformer extends the Conformer ASR encoder with progressive downsampling and grouped attention.
+Strided convolution blocks shrink the time dimension as the network deepens, and grouped attention layers
+cut the cost of self-attention over long feature sequences. Together these changes reduce the parameter
+count and latency of the encoder while keeping recognition accuracy close to the standard Conformer.
+
+## Step 1: Installation
+
+```bash
+cd ../../../../toolbox/WeNet/
+bash install_toolbox_wenet.sh
+```
+
+## Step 2: Training
+
+The dataset consists of data_aishell.tgz and resource_aishell.tgz (AISHELL-1).
+You can simply run the whole script, which downloads the dataset automatically.
+
+**You need to modify the dataset path in run.sh.**
+
+```bash
+# Change to the scripts path
+cd wenet/examples/aishell/s0/
+
+# Configure data path and model name
+export data_path="/path/to/aishell"
+export model_name="u2++_efficonformer_v2"
+
+# Run all stages
+bash run.sh --stage -1 --stop-stage 6
+```
+
+Alternatively, you can run each stage manually and check the results to understand the whole process.
+
+```bash
+# Download data
+bash run.sh --stage -1 --stop-stage -1
+# Prepare training data
+bash run.sh --stage 0 --stop-stage 0
+# Extract optional CMVN features
+bash run.sh --stage 1 --stop-stage 1
+# Generate label token dictionary
+bash run.sh --stage 2 --stop-stage 2
+# Prepare WeNet data format
+bash run.sh --stage 3 --stop-stage 3
+# Neural network training
+bash run.sh --stage 4 --stop-stage 4
+# Recognize wav using the trained model
+bash run.sh --stage 5 --stop-stage 5
+# Export the trained model
+bash run.sh --stage 6 --stop-stage 6
+```
+
+## Reference
+- [WeNet](https://github.com/wenet-e2e/wenet)
diff --git a/toolbox/WeNet/patch/examples/aishell/s0/conf/train_u2++_conformer.yaml b/toolbox/WeNet/patch/examples/aishell/s0/conf/train_u2++_efficonformer_v2.yaml
similarity index 64%
rename from toolbox/WeNet/patch/examples/aishell/s0/conf/train_u2++_conformer.yaml
rename to toolbox/WeNet/patch/examples/aishell/s0/conf/train_u2++_efficonformer_v2.yaml
index b4587bce33be458b15490dccbf2f98aaa798959c..2a19c847ff6f660a7a3542dbe386dda49b24ce5f 100644
--- a/toolbox/WeNet/patch/examples/aishell/s0/conf/train_u2++_conformer.yaml
+++ b/toolbox/WeNet/patch/examples/aishell/s0/conf/train_u2++_efficonformer_v2.yaml
@@ -1,35 +1,40 @@
 # network architecture
 # encoder related
-encoder: conformer
+encoder: efficientConformer
 encoder_conf:
-    output_size: 256    # dimension of attention
-    attention_heads: 4
-    linear_units: 2048  # the number of units of position-wise feed forward
-    num_blocks: 12      # the number of encoder blocks
+    activation_type: 'swish'
+    attention_heads: 8
+    causal: false
+    cnn_module_kernel: 15
+    cnn_module_norm: 'layer_norm'
     dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.1
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+    input_layer: conv2d2
+    linear_units: 2048
     normalize_before: true
-    cnn_module_kernel: 8
-    use_cnn_module: True
-    activation_type: 'swish'
+    num_blocks: 12
+    output_size: 256
     pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
-    causal: true
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    use_cnn_module: true
     use_dynamic_chunk: true
-    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
     use_dynamic_left_chunk: false
+    efficient_conf:
+        stride_layer_idx: [3, 7]    # layer id with StrideConv
+        stride: [2, 2]              # stride size of each StrideConv
+        group_layer_idx: [3, 7]     # layer id with GroupedAttention
+        group_size: 3               # group size of every GroupedAttention layer
+        stride_kernel: false        # true: recompute cnn kernels with stride
 
 # decoder related
 decoder: bitransformer
 decoder_conf:
-    attention_heads: 4
+    attention_heads: 8
+    dropout_rate: 0.1
     linear_units: 2048
     num_blocks: 3
-    r_num_blocks: 3
-    dropout_rate: 0.1
     positional_dropout_rate: 0.1
+    r_num_blocks: 3
     self_attention_dropout_rate: 0.1
     src_attention_dropout_rate: 0.1
 
@@ -40,20 +45,31 @@ model_conf:
     length_normalized_loss: false
     reverse_weight: 0.3
 
+# dataset related
 dataset_conf:
+    batch_conf:
+        batch_size: 16
+        batch_type: 'static'
+    fbank_conf:
+        dither: 1.0
+        frame_length: 25
+        frame_shift: 10
+        num_mel_bins: 80
     filter_conf:
         max_length: 40960
         min_length: 0
+        max_output_input_ratio: 0.1
+        min_output_input_ratio: 0.005
         token_max_length: 200
        token_min_length: 1
     resample_conf:
         resample_rate: 16000
+    shuffle: true
+    shuffle_conf:
+        shuffle_size: 1500
+    sort: true
+    sort_conf:
+        sort_size: 500
     spec_aug: true
     spec_aug_conf:
         num_t_mask: 2
@@ -67,19 +83,11 @@ dataset_conf:
     spec_trim: false
     spec_trim_conf:
         max_t: 50
-    shuffle: true
-    shuffle_conf:
-        shuffle_size: 1500
-    sort: true
-    sort_conf:
-        sort_size: 500  # sort_size should be less than shuffle_size
-    batch_conf:
-        batch_type: 'static' # static or dynamic
-        batch_size: 16
+    speed_perturb: true
 
 grad_clip: 5
 accum_grad: 1
-max_epoch: 360
+max_epoch: 200
 log_interval: 100
 
 optim: adam
@@ -87,4 +95,4 @@ optim_conf:
     lr: 0.001
 scheduler: warmuplr     # pytorch v1.1.0+ required
 scheduler_conf:
-    warmup_steps: 25000
+    warmup_steps: 25000
\ No newline at end of file
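For orientation, here is the overall time-axis reduction this config implies (my reading of the Efficient Conformer design, not stated in the patch, and assuming the conv2d2 front end halves the frame rate as in upstream WeNet): the front end contributes 2x, and each StrideConv at blocks 3 and 7 contributes another 2x, giving 8x overall versus 4x for the baseline conv2d Conformer.

```bash
# Back-of-envelope check of the downsampling implied by the config above:
# conv2d2 front end (x2) multiplied by each StrideConv stride.
front_end=2        # input_layer: conv2d2
strides=(2 2)      # encoder_conf.efficient_conf.stride
total=$front_end
for s in "${strides[@]}"; do total=$((total * s)); done
echo "overall time downsampling: ${total}x"   # prints 8x
```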
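A quick smoke test of the patched recipe before committing to a full run (a minimal sketch, not part of the patch: it assumes the patched run.sh resolves the config from the exported model_name, as the README's export suggests, and the dataset path is a placeholder):

```bash
# Work from the recipe directory set up by the WeNet toolbox install.
cd wenet/examples/aishell/s0/

export data_path="/path/to/aishell"          # adjust to your AISHELL-1 location
export model_name="u2++_efficonformer_v2"    # selects conf/train_u2++_efficonformer_v2.yaml

# Verify the renamed config from this patch is present and inspect
# the Efficient Conformer-specific settings before a long training run.
test -f "conf/train_${model_name}.yaml" || { echo "config missing" >&2; exit 1; }
grep -A 5 'efficient_conf' "conf/train_${model_name}.yaml"

# Run the data stages first, then training once they succeed.
bash run.sh --stage -1 --stop-stage 3
bash run.sh --stage 4 --stop-stage 4
```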